From 4e0e8cef54d815037c9be17cbd72b632c12c6793 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Thu, 21 Mar 2024 16:43:50 +1000
Subject: [PATCH] Common: ARM64 compatibility

---
 common/Darwin/DarwinMisc.cpp  | 21 +++++++++++++++++++++
 common/HostSys.h              |  6 ++++++
 common/Linux/LnxHostSys.cpp   | 28 ++++++++++++++++++++++++++++
 common/Pcsx2Defs.h            | 34 +++++++++++++++++++++++++++++-----
 common/VectorIntrin.h         |  8 ++------
 common/Windows/WinHostSys.cpp | 29 +++++++++++++++++++++++++++++
 common/Windows/WinThreads.cpp | 24 ++++++++++++++++++++++++
 common/boost_spsc_queue.hpp   |  3 ++-
 pcsx2/GS/GSCapture.cpp        |  2 +-
 pcsx2/GS/GSLzma.cpp           |  2 +-
 pcsx2/MTGS.cpp                |  6 +++---
 pcsx2/MTVU.h                  |  6 +++---
 12 files changed, 149 insertions(+), 20 deletions(-)
diff --git a/common/Darwin/DarwinMisc.cpp b/common/Darwin/DarwinMisc.cpp
index 1a9a689e35..094d5cfc78 100644
--- a/common/Darwin/DarwinMisc.cpp
+++ b/common/Darwin/DarwinMisc.cpp
@@ -189,6 +189,27 @@ std::vector<DarwinMisc::CPUClass> DarwinMisc::GetCPUClasses()
 	return out;
 }
 
+// Reads the sysctl entry `name` into a T; yields std::nullopt when sysctlbyname() returns non-zero.
+template <typename T>
+static std::optional<T> sysctlbyname_T(const char* name)
+{
+	T value = 0;
+	size_t value_size = sizeof(value);
+	if (sysctlbyname(name, &value, &value_size, nullptr, 0) == 0)
+		return value;
+	return std::nullopt;
+}
+
+size_t HostSys::GetRuntimePageSize()
+{
+	return sysctlbyname_T<u32>("hw.pagesize").value_or(0); // falls back to 0 when the sysctl query fails
+}
+
+size_t HostSys::GetRuntimeCacheLineSize()
+{
+	return static_cast<size_t>(std::max<s64>(sysctlbyname_T<s64>("hw.cachelinesize").value_or(0), 0)); // failed query or negative value -> 0
+}
+
 static __ri vm_prot_t MachProt(const PageProtectionMode& mode)
 {
 	vm_prot_t machmode = (mode.CanWrite()) ? VM_PROT_WRITE : 0;
diff --git a/common/HostSys.h b/common/HostSys.h
index 1e7cfe3827..0830fcc236 100644
--- a/common/HostSys.h
+++ b/common/HostSys.h
@@ -123,6 +123,12 @@ namespace HostSys
 #else
 	void FlushInstructionCache(void* address, u32 size);
 #endif
+
+	/// Returns the size of pages for the current host.
+	size_t GetRuntimePageSize();
+
+	/// Returns the size of a cache line for the current host.
+	size_t GetRuntimeCacheLineSize();
 } // namespace HostSys
 
 namespace PageFaultHandler
diff --git a/common/Linux/LnxHostSys.cpp b/common/Linux/LnxHostSys.cpp
index 0d2835636e..83d24ca6d0 100644
--- a/common/Linux/LnxHostSys.cpp
+++ b/common/Linux/LnxHostSys.cpp
@@ -134,6 +134,34 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size)
 		pxFailRel("Failed to unmap shared memory");
 }
 
+size_t HostSys::GetRuntimePageSize()
+{
+	const int page_size = sysconf(_SC_PAGESIZE); // non-positive results are reported as 0 below
+	return (page_size <= 0) ? 0 : static_cast<size_t>(page_size);
+}
+
+// Returns the largest cache line size (bytes) found via sysconf() and cpu0's sysfs cache entries, or 0 if none.
+size_t HostSys::GetRuntimeCacheLineSize()
+{
+	const int l1d = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+	const int l1i = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+	int res = (l1i > l1d) ? l1i : l1d;
+	for (int index = 0; index < 16; index++)
+	{
+		char buf[128];
+		snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu0/cache/index%d/coherency_line_size", index);
+		std::FILE* fp = std::fopen(buf, "rb");
+		if (!fp)
+			break;
+		const size_t len = std::fread(buf, 1, sizeof(buf) - 1, fp);
+		std::fclose(fp);
+		buf[len] = '\0'; // terminate at the bytes actually read so atoi() never parses leftover path text
+		const int val = std::atoi(buf);
+		res = (val > res) ? val : res;
+	}
+	return (res > 0) ? static_cast<size_t>(res) : 0;
+}
+
 SharedMemoryMappingArea::SharedMemoryMappingArea(u8* base_ptr, size_t size, size_t num_pages)
 	: m_base_ptr(base_ptr)
 	, m_size(size)
diff --git a/common/Pcsx2Defs.h b/common/Pcsx2Defs.h
index 557713c0c6..2ba7d3c6b9 100644
--- a/common/Pcsx2Defs.h
+++ b/common/Pcsx2Defs.h
@@ -4,6 +4,8 @@
 #pragma once
 
 #include "Pcsx2Types.h"
+
+#include <bit>
 #include <cstddef>
 
 // --------------------------------------------------------------------------------------
@@ -21,11 +23,33 @@ static constexpr bool IsDebugBuild = true;
 static constexpr bool IsDebugBuild = false;
 #endif
 
-// Defines the memory page size for the target platform at compilation.  All supported platforms
-// (which means Intel only right now) have a 4k granularity.
-static constexpr unsigned int __pagesize = 0x1000;
-static constexpr unsigned int __pageshift = 12;
-static constexpr unsigned int __pagemask = __pagesize - 1;
+// Compile-time host page size, shift and mask. Define OVERRIDE_HOST_PAGE_SIZE to force a specific page size.
+#if defined(OVERRIDE_HOST_PAGE_SIZE)
+	static constexpr unsigned int __pagesize = OVERRIDE_HOST_PAGE_SIZE;
+	static constexpr unsigned int __pagemask = __pagesize - 1;
+	static constexpr unsigned int __pageshift = std::bit_width(__pagemask); // equals log2(__pagesize) only for power-of-two sizes
+#elif defined(_M_ARM64)
+	// Apple Silicon uses 16KB pages and 128 byte cache lines. NOTE(review): applies to every _M_ARM64 build; confirm other ARM64 hosts use the OVERRIDE_* macros.
+	static constexpr unsigned int __pagesize = 0x4000;
+	static constexpr unsigned int __pageshift = 14;
+	static constexpr unsigned int __pagemask = __pagesize - 1;
+#else
+	// X86 uses a 4KB granularity and 64 byte cache lines.
+	static constexpr unsigned int __pagesize = 0x1000;
+	static constexpr unsigned int __pageshift = 12;
+	static constexpr unsigned int __pagemask = __pagesize - 1;
+#endif
+#if defined(OVERRIDE_HOST_CACHE_LINE_SIZE)
+	static constexpr unsigned int __cachelinesize = OVERRIDE_HOST_CACHE_LINE_SIZE;
+#elif defined(_M_ARM64)
+	static constexpr unsigned int __cachelinesize = 128; // the Apple Silicon figure quoted in the page-size branch
+#else
+	static constexpr unsigned int __cachelinesize = 64; // the x86 figure quoted in the page-size branch
+#endif
+
+// We use 4KB alignment for globals for both Apple and x86 platforms, since computing the
+// address on ARM64 is a single instruction (adrp).
+static constexpr unsigned int __pagealignsize = 0x1000;
 
 // --------------------------------------------------------------------------------------
 //  Microsoft Visual Studio
diff --git a/common/VectorIntrin.h b/common/VectorIntrin.h
index bbe5883306..58c97900cd 100644
--- a/common/VectorIntrin.h
+++ b/common/VectorIntrin.h
@@ -5,12 +5,12 @@
 
 #pragma once
 
+#if defined(_M_X86)
+
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
 
-#if defined(_M_X86)
-
 #if defined(__AVX2__)
 #define _M_SSE 0x501
 #elif defined(__AVX__)
@@ -36,12 +36,8 @@
 #include <immintrin.h>
 
 #elif defined(_M_ARM64)
-#if defined(_MSC_VER) && !defined(__clang__)
-#include <arm64_neon.h>
-#else
 #include <arm_neon.h>
 #endif
-#endif
 
 #ifdef __APPLE__
 #include <stdlib.h> // alloca
diff --git a/common/Windows/WinHostSys.cpp b/common/Windows/WinHostSys.cpp
index 978a97782f..61d5581759 100644
--- a/common/Windows/WinHostSys.cpp
+++ b/common/Windows/WinHostSys.cpp
@@ -100,6 +100,35 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size)
 		pxFail("Failed to unmap shared memory");
 }
 
+size_t HostSys::GetRuntimePageSize()
+{
+	SYSTEM_INFO system_info = {}; // zero-initialised before GetSystemInfo() fills it in
+	GetSystemInfo(&system_info);
+	return static_cast<size_t>(system_info.dwPageSize);
+}
+
+// Returns the largest line size among the cache entries reported by the OS, or 0 if the query fails.
+size_t HostSys::GetRuntimeCacheLineSize()
+{
+	// The first call passes no buffer, so it only reports the required size in bytes.
+	DWORD bytes = 0;
+	if (!GetLogicalProcessorInformation(nullptr, &bytes) && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+		return 0;
+
+	constexpr size_t entry_size = sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+	auto entries = std::make_unique<SYSTEM_LOGICAL_PROCESSOR_INFORMATION[]>((bytes + (entry_size - 1)) / entry_size);
+	if (!GetLogicalProcessorInformation(entries.get(), &bytes))
+		return 0;
+
+	u32 largest = 0;
+	for (u32 i = 0; i < bytes / entry_size; i++)
+	{
+		if (entries[i].Relationship == RelationCache)
+			largest = std::max<u32>(largest, entries[i].Cache.LineSize);
+	}
+	return largest;
+}
+
 #ifdef _M_ARM64
 
 void HostSys::FlushInstructionCache(void* address, u32 size)
diff --git a/common/Windows/WinThreads.cpp b/common/Windows/WinThreads.cpp
index 2e87794792..0050847cac 100644
--- a/common/Windows/WinThreads.cpp
+++ b/common/Windows/WinThreads.cpp
@@ -107,10 +107,20 @@ Threading::ThreadHandle& Threading::ThreadHandle::operator=(const ThreadHandle&
 
 u64 Threading::ThreadHandle::GetCPUTime() const
 {
+#ifndef _M_ARM64
 	u64 ret = 0;
 	if (m_native_handle)
 		QueryThreadCycleTime((HANDLE)m_native_handle, &ret);
 	return ret;
+#else
+	// ARM64 path: kernel + user time from GetThreadTimes(), as raw 64-bit FILETIME values; 0 on failure.
+	FILETIME ignored, kernel, user;
+	if (!GetThreadTimes((HANDLE)m_native_handle, &ignored, &ignored, &kernel, &user))
+		return 0;
+	const u64 kernel_ticks = (static_cast<u64>(kernel.dwHighDateTime) << 32) | static_cast<u64>(kernel.dwLowDateTime);
+	const u64 user_ticks = (static_cast<u64>(user.dwHighDateTime) << 32) | static_cast<u64>(user.dwLowDateTime);
+	return user_ticks + kernel_ticks;
+#endif
 }
 
 bool Threading::ThreadHandle::SetAffinity(u64 processor_mask) const
@@ -198,13 +208,24 @@ Threading::ThreadHandle& Threading::Thread::operator=(Thread&& thread)
 
 u64 Threading::GetThreadCpuTime()
 {
+#ifndef _M_ARM64
 	u64 ret = 0;
 	QueryThreadCycleTime(GetCurrentThread(), &ret);
 	return ret;
+#else
+	// ARM64 path: kernel + user time of the calling thread, as raw 64-bit FILETIME values; 0 on failure.
+	FILETIME ignored, kernel, user;
+	if (!GetThreadTimes(GetCurrentThread(), &ignored, &ignored, &kernel, &user))
+		return 0;
+	const u64 kernel_ticks = (static_cast<u64>(kernel.dwHighDateTime) << 32) | static_cast<u64>(kernel.dwLowDateTime);
+	const u64 user_ticks = (static_cast<u64>(user.dwHighDateTime) << 32) | static_cast<u64>(user.dwLowDateTime);
+	return user_ticks + kernel_ticks;
+#endif
 }
 
 u64 Threading::GetThreadTicksPerSecond()
 {
+#ifndef _M_ARM64
 	// On x86, despite what the MS documentation says, this basically appears to be rdtsc.
 	// So, the frequency is our base clock speed (and stable regardless of power management).
 	static u64 frequency = 0;
@@ -224,6 +245,9 @@ u64 Threading::GetThreadTicksPerSecond()
 		}
 	}
 	return frequency;
+#else
+	return 10000000;
+#endif
 }
 
 void Threading::SetNameOfCurrentThread(const char* name)
diff --git a/common/boost_spsc_queue.hpp b/common/boost_spsc_queue.hpp
index e7ac2acc05..8468d9f28d 100644
--- a/common/boost_spsc_queue.hpp
+++ b/common/boost_spsc_queue.hpp
@@ -46,11 +46,12 @@
 
 #include <atomic>
 #include "AlignedMalloc.h"
+#include "Pcsx2Defs.h"
 
 template <typename T, size_t max_size>
 class ringbuffer_base
 {
-    static const int padding_size = 64 - sizeof(size_t);
+    static const int padding_size = __cachelinesize - sizeof(size_t);
 
     std::atomic<size_t> write_index_;
     char padding1[padding_size]; /* force read_index and write_index to different cache lines */
diff --git a/pcsx2/GS/GSCapture.cpp b/pcsx2/GS/GSCapture.cpp
index 057ba34a8c..9677011cfe 100644
--- a/pcsx2/GS/GSCapture.cpp
+++ b/pcsx2/GS/GSCapture.cpp
@@ -211,7 +211,7 @@ namespace GSCapture
 	static std::unique_ptr<s16[]> s_audio_buffer;
 	static std::atomic<u32> s_audio_buffer_size{0};
 	static u32 s_audio_buffer_write_pos = 0;
-	alignas(64) static u32 s_audio_buffer_read_pos = 0;
+	alignas(__cachelinesize) static u32 s_audio_buffer_read_pos = 0;
 } // namespace GSCapture
 
 #ifndef USE_LINKED_FFMPEG
diff --git a/pcsx2/GS/GSLzma.cpp b/pcsx2/GS/GSLzma.cpp
index 87da896773..ec5d7166ae 100644
--- a/pcsx2/GS/GSLzma.cpp
+++ b/pcsx2/GS/GSLzma.cpp
@@ -278,7 +278,7 @@ namespace
 		size_t m_block_pos = 0;
 
 		DynamicHeapArray<u8, 64> m_block_read_buffer;
-		alignas(64) CXzUnpacker m_unpacker = {};
+		alignas(__cachelinesize) CXzUnpacker m_unpacker = {};
 	};
 
 	GSDumpLzma::GSDumpLzma() = default;
diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp
index f5034de462..252d72083f 100644
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@@ -61,12 +61,12 @@ namespace MTGS
 
 	static void SetEvent();
 
-	alignas(32) BufferedData RingBuffer;
+	alignas(__cachelinesize) BufferedData RingBuffer;
 
 	// note: when m_ReadPos == m_WritePos, the fifo is empty
 	// Threading info: m_ReadPos is updated by the MTGS thread. m_WritePos is updated by the EE thread
-	alignas(64) static std::atomic<unsigned int> s_ReadPos; // cur pos gs is reading from
-	alignas(64) static std::atomic<unsigned int> s_WritePos; // cur pos ee thread is writing to
+	alignas(__cachelinesize) static std::atomic<unsigned int> s_ReadPos; // cur pos gs is reading from
+	alignas(__cachelinesize) static std::atomic<unsigned int> s_WritePos; // cur pos ee thread is writing to
 
 	// These vars maintain instance data for sending Data Packets.
 	// Only one data packet can be constructed and uploaded at a time.
diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h
index 7fdc32883d..a6d2cf3701 100644
--- a/pcsx2/MTVU.h
+++ b/pcsx2/MTVU.h
@@ -21,9 +21,9 @@ class VU_Thread final {
 
 	u32 buffer[buffer_size];
 	// Note: keep atomic on separate cache line to avoid CPU conflict
-	alignas(64) std::atomic<int> m_ato_read_pos; // Only modified by VU thread
-	alignas(64) std::atomic<int> m_ato_write_pos;    // Only modified by EE thread
-	alignas(64) int  m_read_pos; // temporary read pos (local to the VU thread)
+	alignas(__cachelinesize) std::atomic<int> m_ato_read_pos; // Only modified by VU thread
+	alignas(__cachelinesize) std::atomic<int> m_ato_write_pos;    // Only modified by EE thread
+	alignas(__cachelinesize) int  m_read_pos; // temporary read pos (local to the VU thread)
 	int  m_write_pos; // temporary write pos (local to the EE thread)
 	Threading::WorkSema semaEvent;
 	std::atomic_bool m_shutdown_flag{false};