From 4e0e8cef54d815037c9be17cbd72b632c12c6793 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 21 Mar 2024 16:43:50 +1000 Subject: [PATCH] Common: ARM64 compatibility --- common/Darwin/DarwinMisc.cpp | 21 +++++++++++++++++++++ common/HostSys.h | 6 ++++++ common/Linux/LnxHostSys.cpp | 28 ++++++++++++++++++++++++++++ common/Pcsx2Defs.h | 34 +++++++++++++++++++++++++++++----- common/VectorIntrin.h | 8 ++------ common/Windows/WinHostSys.cpp | 29 +++++++++++++++++++++++++++++ common/Windows/WinThreads.cpp | 24 ++++++++++++++++++++++++ common/boost_spsc_queue.hpp | 3 ++- pcsx2/GS/GSCapture.cpp | 2 +- pcsx2/GS/GSLzma.cpp | 2 +- pcsx2/MTGS.cpp | 6 +++--- pcsx2/MTVU.h | 6 +++--- 12 files changed, 149 insertions(+), 20 deletions(-) diff --git a/common/Darwin/DarwinMisc.cpp b/common/Darwin/DarwinMisc.cpp index 1a9a689e35..094d5cfc78 100644 --- a/common/Darwin/DarwinMisc.cpp +++ b/common/Darwin/DarwinMisc.cpp @@ -189,6 +189,27 @@ std::vector DarwinMisc::GetCPUClasses() return out; } +template +static std::optional sysctlbyname_T(const char* name) +{ + T output = 0; + size_t output_size = sizeof(output); + if (sysctlbyname(name, &output, &output_size, nullptr, 0) != 0) + return std::nullopt; + + return output; +} + +size_t HostSys::GetRuntimePageSize() +{ + return sysctlbyname_T("hw.pagesize").value_or(0); +} + +size_t HostSys::GetRuntimeCacheLineSize() +{ + return static_cast(std::max(sysctlbyname_T("hw.cachelinesize").value_or(0), 0)); +} + static __ri vm_prot_t MachProt(const PageProtectionMode& mode) { vm_prot_t machmode = (mode.CanWrite()) ? VM_PROT_WRITE : 0; diff --git a/common/HostSys.h b/common/HostSys.h index 1e7cfe3827..0830fcc236 100644 --- a/common/HostSys.h +++ b/common/HostSys.h @@ -123,6 +123,12 @@ namespace HostSys #else void FlushInstructionCache(void* address, u32 size); #endif + + /// Returns the size of pages for the current host. + size_t GetRuntimePageSize(); + + /// Returns the size of a cache line for the current host. 
+ size_t GetRuntimeCacheLineSize(); } // namespace HostSys namespace PageFaultHandler diff --git a/common/Linux/LnxHostSys.cpp b/common/Linux/LnxHostSys.cpp index 0d2835636e..83d24ca6d0 100644 --- a/common/Linux/LnxHostSys.cpp +++ b/common/Linux/LnxHostSys.cpp @@ -134,6 +134,34 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size) pxFailRel("Failed to unmap shared memory"); } +size_t HostSys::GetRuntimePageSize() +{ + int res = sysconf(_SC_PAGESIZE); + return (res > 0) ? static_cast<size_t>(res) : 0; +} + +size_t HostSys::GetRuntimeCacheLineSize() +{ + const long l1d = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + const long l1i = sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + long res = (l1d > l1i) ? l1d : l1i; + for (int index = 0; index < 16; index++) + { + char buf[128]; + snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu0/cache/index%d/coherency_line_size", index); + std::FILE* fp = std::fopen(buf, "rb"); + if (!fp) + break; + // Read at most sizeof(buf) - 1 bytes and NUL-terminate, so atoi() never parses leftover path text. + buf[std::fread(buf, 1, sizeof(buf) - 1, fp)] = '\0'; + std::fclose(fp); + const int val = std::atoi(buf); + res = (val > res) ? val : res; + } + + return (res > 0) ? static_cast<size_t>(res) : 0; +} + SharedMemoryMappingArea::SharedMemoryMappingArea(u8* base_ptr, size_t size, size_t num_pages) : m_base_ptr(base_ptr) , m_size(size) diff --git a/common/Pcsx2Defs.h b/common/Pcsx2Defs.h index 557713c0c6..2ba7d3c6b9 100644 --- a/common/Pcsx2Defs.h +++ b/common/Pcsx2Defs.h @@ -4,6 +4,8 @@ #pragma once #include "Pcsx2Types.h" + +#include #include // -------------------------------------------------------------------------------------- @@ -21,11 +23,33 @@ static constexpr bool IsDebugBuild = true; static constexpr bool IsDebugBuild = false; #endif -// Defines the memory page size for the target platform at compilation. All supported platforms -// (which means Intel only right now) have a 4k granularity. 
-static constexpr unsigned int __pagesize = 0x1000; -static constexpr unsigned int __pageshift = 12; -static constexpr unsigned int __pagemask = __pagesize - 1; +// Defines the memory page size for the target platform at compilation. +#if defined(OVERRIDE_HOST_PAGE_SIZE) + static constexpr unsigned int __pagesize = OVERRIDE_HOST_PAGE_SIZE; + static constexpr unsigned int __pagemask = __pagesize - 1; + static constexpr unsigned int __pageshift = std::bit_width(__pagemask); +#elif defined(_M_ARM64) + // Apple Silicon uses 16KB pages and 128 byte cache lines. + static constexpr unsigned int __pagesize = 0x4000; + static constexpr unsigned int __pageshift = 14; + static constexpr unsigned int __pagemask = __pagesize - 1; +#else + // X86 uses a 4KB granularity and 64 byte cache lines. + static constexpr unsigned int __pagesize = 0x1000; + static constexpr unsigned int __pageshift = 12; + static constexpr unsigned int __pagemask = __pagesize - 1; +#endif +#if defined(OVERRIDE_HOST_CACHE_LINE_SIZE) + static constexpr unsigned int __cachelinesize = OVERRIDE_HOST_CACHE_LINE_SIZE; +#elif defined(_M_ARM64) + static constexpr unsigned int __cachelinesize = 128; +#else + static constexpr unsigned int __cachelinesize = 64; +#endif + +// We use 4KB alignment for globals for both Apple and x86 platforms, since computing the +// address on ARM64 is a single instruction (adrp). 
+static constexpr unsigned int __pagealignsize = 0x1000; // -------------------------------------------------------------------------------------- // Microsoft Visual Studio diff --git a/common/VectorIntrin.h b/common/VectorIntrin.h index bbe5883306..58c97900cd 100644 --- a/common/VectorIntrin.h +++ b/common/VectorIntrin.h @@ -5,12 +5,12 @@ #pragma once +#if defined(_M_X86) + #ifdef _MSC_VER #include #endif -#if defined(_M_X86) - #if defined(__AVX2__) #define _M_SSE 0x501 #elif defined(__AVX__) @@ -36,12 +36,8 @@ #include #elif defined(_M_ARM64) -#if defined(_MSC_VER) && !defined(__clang__) -#include -#else #include #endif -#endif #ifdef __APPLE__ #include // alloca diff --git a/common/Windows/WinHostSys.cpp b/common/Windows/WinHostSys.cpp index 978a97782f..61d5581759 100644 --- a/common/Windows/WinHostSys.cpp +++ b/common/Windows/WinHostSys.cpp @@ -100,6 +100,35 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size) pxFail("Failed to unmap shared memory"); } +size_t HostSys::GetRuntimePageSize() +{ + SYSTEM_INFO si = {}; + GetSystemInfo(&si); + return si.dwPageSize; +} + +size_t HostSys::GetRuntimeCacheLineSize() +{ + DWORD size = 0; + if (!GetLogicalProcessorInformation(nullptr, &size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + return 0; + + std::unique_ptr lpi = + std::make_unique( + (size + (sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) - 1)) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); + if (!GetLogicalProcessorInformation(lpi.get(), &size)) + return 0; + + u32 max_line_size = 0; + for (u32 i = 0; i < size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); i++) + { + if (lpi[i].Relationship == RelationCache) + max_line_size = std::max(max_line_size, lpi[i].Cache.LineSize); + } + + return max_line_size; +} + #ifdef _M_ARM64 void HostSys::FlushInstructionCache(void* address, u32 size) diff --git a/common/Windows/WinThreads.cpp b/common/Windows/WinThreads.cpp index 2e87794792..0050847cac 100644 --- a/common/Windows/WinThreads.cpp +++ 
b/common/Windows/WinThreads.cpp @@ -107,10 +107,20 @@ Threading::ThreadHandle& Threading::ThreadHandle::operator=(const ThreadHandle& u64 Threading::ThreadHandle::GetCPUTime() const { +#ifndef _M_ARM64 u64 ret = 0; if (m_native_handle) QueryThreadCycleTime((HANDLE)m_native_handle, &ret); return ret; +#else + FILETIME user, kernel, unused; + if (!GetThreadTimes((HANDLE)m_native_handle, &unused, &unused, &kernel, &user)) + return 0; + + const u64 user_time = (static_cast(user.dwHighDateTime) << 32) | static_cast(user.dwLowDateTime); + const u64 kernel_time = (static_cast(kernel.dwHighDateTime) << 32) | static_cast(kernel.dwLowDateTime); + return user_time + kernel_time; +#endif } bool Threading::ThreadHandle::SetAffinity(u64 processor_mask) const @@ -198,13 +208,24 @@ Threading::ThreadHandle& Threading::Thread::operator=(Thread&& thread) u64 Threading::GetThreadCpuTime() { +#ifndef _M_ARM64 u64 ret = 0; QueryThreadCycleTime(GetCurrentThread(), &ret); return ret; +#else + FILETIME user, kernel, unused; + if (!GetThreadTimes(GetCurrentThread(), &unused, &unused, &kernel, &user)) + return 0; + + const u64 user_time = (static_cast(user.dwHighDateTime) << 32) | static_cast(user.dwLowDateTime); + const u64 kernel_time = (static_cast(kernel.dwHighDateTime) << 32) | static_cast(kernel.dwLowDateTime); + return user_time + kernel_time; +#endif } u64 Threading::GetThreadTicksPerSecond() { +#ifndef _M_ARM64 // On x86, despite what the MS documentation says, this basically appears to be rdtsc. // So, the frequency is our base clock speed (and stable regardless of power management). 
static u64 frequency = 0; @@ -224,6 +245,9 @@ u64 Threading::GetThreadTicksPerSecond() } } return frequency; +#else + return 10000000; +#endif } void Threading::SetNameOfCurrentThread(const char* name) diff --git a/common/boost_spsc_queue.hpp b/common/boost_spsc_queue.hpp index e7ac2acc05..8468d9f28d 100644 --- a/common/boost_spsc_queue.hpp +++ b/common/boost_spsc_queue.hpp @@ -46,11 +46,12 @@ #include #include "AlignedMalloc.h" +#include "Pcsx2Defs.h" template class ringbuffer_base { - static const int padding_size = 64 - sizeof(size_t); + static const int padding_size = __cachelinesize - sizeof(size_t); std::atomic write_index_; char padding1[padding_size]; /* force read_index and write_index to different cache lines */ diff --git a/pcsx2/GS/GSCapture.cpp b/pcsx2/GS/GSCapture.cpp index 057ba34a8c..9677011cfe 100644 --- a/pcsx2/GS/GSCapture.cpp +++ b/pcsx2/GS/GSCapture.cpp @@ -211,7 +211,7 @@ namespace GSCapture static std::unique_ptr s_audio_buffer; static std::atomic s_audio_buffer_size{0}; static u32 s_audio_buffer_write_pos = 0; - alignas(64) static u32 s_audio_buffer_read_pos = 0; + alignas(__cachelinesize) static u32 s_audio_buffer_read_pos = 0; } // namespace GSCapture #ifndef USE_LINKED_FFMPEG diff --git a/pcsx2/GS/GSLzma.cpp b/pcsx2/GS/GSLzma.cpp index 87da896773..ec5d7166ae 100644 --- a/pcsx2/GS/GSLzma.cpp +++ b/pcsx2/GS/GSLzma.cpp @@ -278,7 +278,7 @@ namespace size_t m_block_pos = 0; DynamicHeapArray m_block_read_buffer; - alignas(64) CXzUnpacker m_unpacker = {}; + alignas(__cachelinesize) CXzUnpacker m_unpacker = {}; }; GSDumpLzma::GSDumpLzma() = default; diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index f5034de462..252d72083f 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -61,12 +61,12 @@ namespace MTGS static void SetEvent(); - alignas(32) BufferedData RingBuffer; + alignas(__cachelinesize) BufferedData RingBuffer; // note: when m_ReadPos == m_WritePos, the fifo is empty // Threading info: m_ReadPos is updated by the MTGS thread. 
m_WritePos is updated by the EE thread - alignas(64) static std::atomic s_ReadPos; // cur pos gs is reading from - alignas(64) static std::atomic s_WritePos; // cur pos ee thread is writing to + alignas(__cachelinesize) static std::atomic s_ReadPos; // cur pos gs is reading from + alignas(__cachelinesize) static std::atomic s_WritePos; // cur pos ee thread is writing to // These vars maintain instance data for sending Data Packets. // Only one data packet can be constructed and uploaded at a time. diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h index 7fdc32883d..a6d2cf3701 100644 --- a/pcsx2/MTVU.h +++ b/pcsx2/MTVU.h @@ -21,9 +21,9 @@ class VU_Thread final { u32 buffer[buffer_size]; // Note: keep atomic on separate cache line to avoid CPU conflict - alignas(64) std::atomic m_ato_read_pos; // Only modified by VU thread - alignas(64) std::atomic m_ato_write_pos; // Only modified by EE thread - alignas(64) int m_read_pos; // temporary read pos (local to the VU thread) + alignas(__cachelinesize) std::atomic m_ato_read_pos; // Only modified by VU thread + alignas(__cachelinesize) std::atomic m_ato_write_pos; // Only modified by EE thread + alignas(__cachelinesize) int m_read_pos; // temporary read pos (local to the VU thread) int m_write_pos; // temporary write pos (local to the EE thread) Threading::WorkSema semaEvent; std::atomic_bool m_shutdown_flag{false};