From aa4fc8dd601e5a832318bc907115a40ac7535a04 Mon Sep 17 00:00:00 2001
From: David Guillen Fandos
Date: Sun, 12 May 2019 23:25:43 +0200
Subject: [PATCH] Move cache invalidation to vmem_platform and implement
 NO_RWX on arm64

Not functional yet: this commit breaks arm64 on Android, for instance.
Rewrites are not figured out yet; they will be pushed in the next commit.
---
 core/hw/aica/dsp_arm64.cpp   | 14 +++----
 core/hw/arm7/arm64.cpp       |  8 ++--
 core/hw/mem/_vmem.cpp        |  1 +
 core/hw/mem/_vmem.h          |  2 +
 core/linux/posix_vmem.cpp    | 56 +++++++++++++++++++++++++++-
 core/rec-ARM64/rec_arm64.cpp | 71 ++++++++++++------------------------
 core/windows/win_vmem.cpp    |  2 +-
 7 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/core/hw/aica/dsp_arm64.cpp b/core/hw/aica/dsp_arm64.cpp
index ba5515ed4..cac7b4c71 100644
--- a/core/hw/aica/dsp_arm64.cpp
+++ b/core/hw/aica/dsp_arm64.cpp
@@ -27,7 +27,7 @@
 #include "deps/vixl/aarch64/macro-assembler-aarch64.h"
 using namespace vixl::aarch64;
 
-extern void Arm64CacheFlush(void* start, void* end);
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 class DSPAssembler : public MacroAssembler
 {
@@ -54,9 +54,9 @@ public:
 			Stp(xzr, xzr, MemOperand(x0, 48));
 			Ret();
 			FinalizeCode();
-#ifdef _ANDROID
-			Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
-#endif
+			vmem_platform_flush_cache(
+				GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>(),
+				GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
 
 			return;
 		}
@@ -387,9 +387,9 @@ public:
 #endif
 
 		FinalizeCode();
-#ifdef _ANDROID
-		Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
-#endif
+		vmem_platform_flush_cache(
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>(),
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
 	}
 
 private:
diff --git a/core/hw/arm7/arm64.cpp b/core/hw/arm7/arm64.cpp
index 89e57ab66..e712855bd 100644
--- a/core/hw/arm7/arm64.cpp
+++ b/core/hw/arm7/arm64.cpp
@@ -28,7 +28,7 @@ using namespace vixl::aarch64;
 
 //#include "deps/vixl/aarch32/disasm-aarch32.h"
 
-extern void Arm64CacheFlush(void* start, void* end);
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 extern u32 arm_single_op(u32 opcode);
 extern "C" void arm_dispatch();
 extern "C" void arm_exit();
@@ -41,7 +41,7 @@ extern reg_pair arm_Reg[RN_ARM_REG_COUNT];
 MacroAssembler *assembler;
 
 extern "C" void armFlushICache(void *bgn, void *end) {
-	Arm64CacheFlush(bgn, end);
+	vmem_platform_flush_cache(bgn, end, bgn, end);
 }
 
 static MemOperand arm_reg_operand(u32 regn)
@@ -143,7 +143,9 @@ void armv_end(void* codestart, u32 cycl)
 
 	assembler->FinalizeCode();
 	verify(assembler->GetBuffer()->GetCursorOffset() <= assembler->GetBuffer()->GetCapacity());
-	Arm64CacheFlush(codestart, assembler->GetBuffer()->GetEndAddress<void*>());
+	vmem_platform_flush_cache(
+		codestart, assembler->GetBuffer()->GetEndAddress<void*>(),
+		codestart, assembler->GetBuffer()->GetEndAddress<void*>());
 	icPtr += assembler->GetBuffer()->GetSizeInBytes();
 
 #if 0
diff --git a/core/hw/mem/_vmem.cpp b/core/hw/mem/_vmem.cpp
index 3fbfdf6b5..cabe7941c 100644
--- a/core/hw/mem/_vmem.cpp
+++ b/core/hw/mem/_vmem.cpp
@@ -469,6 +469,7 @@ bool _vmem_reserve() {
 	}
 	else {
 		printf("Info: nvmem is enabled, with addr space of size %s\n", vmemstatus == MemType4GB ? "4GB" : "512MB");
+		printf("Info: p_sh4rcb: %p virt_ram_base: %p\n", p_sh4rcb, virt_ram_base);
 		// Map the different parts of the memory file into the new memory range we got.
 		#define MAP_RAM_START_OFFSET  0
 		#define MAP_VRAM_START_OFFSET (MAP_RAM_START_OFFSET+RAM_SIZE)
diff --git a/core/hw/mem/_vmem.h b/core/hw/mem/_vmem.h
index 394194146..dca2c09e3 100644
--- a/core/hw/mem/_vmem.h
+++ b/core/hw/mem/_vmem.h
@@ -30,6 +30,8 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 // Same as above but uses two address spaces one with RX and RW protections.
 // Note: this function doesnt have to be implemented, it's a fallback for the above one.
 bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code_area_rw, uintptr_t *rx_offset);
+// This might not need an implementation (e.g. x86/x64 CPUs).
+void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 // Note: if you want to disable vmem magic in any given platform, implement the
 // above functions as empty functions and make vmem_platform_init return MemTypeError.
diff --git a/core/linux/posix_vmem.cpp b/core/linux/posix_vmem.cpp
index 8547955e1..9a573cdae 100644
--- a/core/linux/posix_vmem.cpp
+++ b/core/linux/posix_vmem.cpp
@@ -215,9 +215,63 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 
 	*code_area_rw = ptr_rw;
 	*rx_offset = (char*)ptr_rx - (char*)ptr_rw;
 
-	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %p\n", ptr_rx, ptr_rw, *rx_offset);
+	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %lu\n", ptr_rx, ptr_rw, (unsigned long)*rx_offset);
 
 	return (ptr_rw != MAP_FAILED);
 }
 
+
+// Some OSes restrict cache flushing, cause why not right? :D
+
+#if HOST_CPU == CPU_ARM64
+
+// Code borrowed from Dolphin https://github.com/dolphin-emu/dolphin
+static void Arm64_CacheFlush(void* start, void* end) {
+	if (start == end)
+		return;
+
+#if HOST_OS == OS_DARWIN
+	// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
+	sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
+#else
+	// Don't rely on GCC's __clear_cache implementation, as it caches
+	// icache/dcache cache line sizes, that can vary between cores on
+	// big.LITTLE architectures.
+	u64 addr, ctr_el0;
+	static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
+	size_t isize, dsize;
+
+	__asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
+	isize = 4 << ((ctr_el0 >> 0) & 0xf);
+	dsize = 4 << ((ctr_el0 >> 16) & 0xf);
+
+	// use the global minimum cache line size
+	icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
+	dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
+
+	addr = (u64)start & ~(u64)(dsize - 1);
+	for (; addr < (u64)end; addr += dsize)
+		// use "civac" instead of "cvau", as this is the suggested workaround for
+		// Cortex-A53 errata 819472, 826319, 827319 and 824069.
+		__asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
+	__asm__ volatile("dsb ish" : : : "memory");
+
+	addr = (u64)start & ~(u64)(isize - 1);
+	for (; addr < (u64)end; addr += isize)
+		__asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
+
+	__asm__ volatile("dsb ish" : : : "memory");
+	__asm__ volatile("isb" : : : "memory");
+#endif
+}
+
+
+void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end) {
+	Arm64_CacheFlush(dcache_start, dcache_end);
+
+	// Don't risk it: flush and invalidate icache & dcache for both ranges, just in case.
+	if (icache_start != dcache_start)
+		Arm64_CacheFlush(icache_start, icache_end);
+}
+
+#endif // #if HOST_CPU == CPU_ARM64
diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp
index 2a9108349..03795ab13 100644
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@@ -45,12 +45,11 @@ using namespace vixl::aarch64;
 extern "C" void no_update();
 extern "C" void intc_sched();
 extern "C" void ngen_blockcheckfail(u32 pc);
-
 extern "C" void ngen_LinkBlock_Generic_stub();
 extern "C" void ngen_LinkBlock_cond_Branch_stub();
 extern "C" void ngen_LinkBlock_cond_Next_stub();
-
 extern "C" void ngen_FailedToFindBlock_();
+extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
 
 struct DynaRBI : RuntimeBlockInfo
 {
@@ -61,47 +60,6 @@ struct DynaRBI : RuntimeBlockInfo
 	}
 };
 
-// Code borrowed from Dolphin https://github.com/dolphin-emu/dolphin
-void Arm64CacheFlush(void* start, void* end)
-{
-	if (start == end)
-		return;
-
-#if HOST_OS == OS_DARWIN
-	// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
-	sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
-#else
-	// Don't rely on GCC's __clear_cache implementation, as it caches
-	// icache/dcache cache line sizes, that can vary between cores on
-	// big.LITTLE architectures.
-	u64 addr, ctr_el0;
-	static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
-	size_t isize, dsize;
-
-	__asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
-	isize = 4 << ((ctr_el0 >> 0) & 0xf);
-	dsize = 4 << ((ctr_el0 >> 16) & 0xf);
-
-	// use the global minimum cache line size
-	icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
-	dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
-
-	addr = (u64)start & ~(u64)(dsize - 1);
-	for (; addr < (u64)end; addr += dsize)
-		// use "civac" instead of "cvau", as this is the suggested workaround for
-		// Cortex-A53 errata 819472, 826319, 827319 and 824069.
-		__asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
-	__asm__ volatile("dsb ish" : : : "memory");
-
-	addr = (u64)start & ~(u64)(isize - 1);
-	for (; addr < (u64)end; addr += isize)
-		__asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
-
-	__asm__ volatile("dsb ish" : : : "memory");
-	__asm__ volatile("isb" : : : "memory");
-#endif
-}
-
 double host_cpu_time;
 u64 guest_cpu_cycles;
 
@@ -147,7 +105,7 @@ __asm__
 "ngen_LinkBlock_Shared_stub:	\n\t"
 	"mov x0, lr					\n\t"
 	"sub x0, x0, #4				\n\t"	// go before the call
-	"bl rdv_LinkBlock			\n\t"
+	"bl rdv_LinkBlock			\n\t"	// returns an RX addr
 	"br x0						\n"
 
 ".hidden ngen_FailedToFindBlock_	\n\t"
@@ -1013,7 +971,7 @@ public:
 
 			Ldr(w29, sh4_context_mem_operand(&next_pc));
 
-			GenBranch(no_update);
+			GenBranchRuntime(no_update);
 			break;
 
 		default:
@@ -1038,7 +996,12 @@ public:
 			emit_Skip(block->host_code_size);
 		}
 
-		Arm64CacheFlush(GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
+
+		// Flush and invalidate caches
+		vmem_platform_flush_cache(
+			CC_RW2RX(GetBuffer()->GetStartAddress<void*>()), CC_RW2RX(GetBuffer()->GetEndAddress<void*>()),
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
+
 #if 0
 //		if (rewrite)
 		{
@@ -1060,10 +1023,13 @@ public:
 	}
 
 private:
+	// Runtime branches/calls need to be adjusted if rx space is different from rw space.
+	// Therefore can't mix GenBranch with GenBranchRuntime!
+
 	template <typename R, typename... P>
 	void GenCallRuntime(R (*function)(P...))
 	{
-		ptrdiff_t offset = reinterpret_cast<uintptr_t>(function) - GetBuffer()->GetStartAddress<uintptr_t>();
+		ptrdiff_t offset = reinterpret_cast<uintptr_t>(function) - reinterpret_cast<uintptr_t>(CC_RW2RX(GetBuffer()->GetStartAddress<void*>()));
 		verify(offset >= -128 * 1024 * 1024 && offset <= 128 * 1024 * 1024);
 		verify((offset & 3) == 0);
 		Label function_label;
@@ -1071,6 +1037,17 @@ private:
 		Bl(&function_label);
 	}
 
+	template <typename R, typename... P>
+	void GenBranchRuntime(R (*target)(P...))
+	{
+		ptrdiff_t offset = reinterpret_cast<uintptr_t>(target) - reinterpret_cast<uintptr_t>(CC_RW2RX(GetBuffer()->GetStartAddress<void*>()));
+		verify(offset >= -128 * 1024 * 1024 && offset <= 128 * 1024 * 1024);
+		verify((offset & 3) == 0);
+		Label target_label;
+		BindToOffset(&target_label, offset);
+		B(&target_label);
+	}
+
 	template <typename R, typename... P>
 	void GenBranch(R (*code)(P...), Condition cond = al)
 	{
diff --git a/core/windows/win_vmem.cpp b/core/windows/win_vmem.cpp
index d62d138cf..edb4b21cd 100644
--- a/core/windows/win_vmem.cpp
+++ b/core/windows/win_vmem.cpp
@@ -183,7 +183,7 @@ bool vmem_platform_prepare_jit_block(void *code_area, unsigned size, void **code
 
 	*code_area_rw = ptr_rw;
 	*rx_offset = (char*)ptr_rx - (char*)ptr_rw;
-	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %p\n", ptr_rx, ptr_rw, *rx_offset);
+	printf("Info: Using NO_RWX mode, rx ptr: %p, rw ptr: %p, offset: %lu\n", ptr_rx, ptr_rw, (unsigned long)*rx_offset);
 
 	return (ptr_rw != NULL);
 }