From 71100679a3409222c925f26e34da2b2fe9f71f5f Mon Sep 17 00:00:00 2001 From: Ty Date: Fri, 17 Jan 2025 17:21:50 -0500 Subject: [PATCH] R5900: Implement ARM NEON intrinsics for the EE cache --- pcsx2/vtlb.cpp | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp index 0b290d7906..2b94267042 100644 --- a/pcsx2/vtlb.cpp +++ b/pcsx2/vtlb.cpp @@ -30,9 +30,6 @@ #include "fmt/format.h" #include -#ifdef _M_X86 -#include -#endif #include #include #include @@ -112,6 +109,16 @@ vtlb_private::VTLBVirtual::VTLBVirtual(VTLBPhysical phys, u32 paddr, u32 vaddr) } } +#if defined(_M_X86) +#include +#elif defined(_M_ARM64) +#if defined(_MSC_VER) && !defined(__clang__) +#include +#else +#include +#endif +#endif + __inline int CheckCache(u32 addr) { // Check if the cache is enabled @@ -123,7 +130,7 @@ __inline int CheckCache(u32 addr) size_t i = 0; const size_t size = cachedTlbs.count; -#ifdef _M_X86 +#if defined(_M_X86) const int stride = 4; const __m128i addr_vec = _mm_set1_epi32(addr); @@ -173,6 +180,32 @@ __inline int CheckCache(u32 addr) return true; } } +#elif defined(_M_ARM64) + const int stride = 4; + + const uint32x4_t addr_vec = vld1q_dup_u32(&addr); + + for (; i + stride <= size; i += stride) + { + const uint32x4_t pfn1_vec = vld1q_u32(&cachedTlbs.PFN1s[i]); + const uint32x4_t pfn0_vec = vld1q_u32(&cachedTlbs.PFN0s[i]); + const uint32x4_t mask_vec = vld1q_u32(&cachedTlbs.PageMasks[i]); + + const uint32x4_t cached1_vec = vld1q_u32(&cachedTlbs.CacheEnabled1[i]); + const uint32x4_t cached0_vec = vld1q_u32(&cachedTlbs.CacheEnabled0[i]); + + const uint32x4_t pfn1_end_vec = vaddq_u32(pfn1_vec, mask_vec); + const uint32x4_t pfn0_end_vec = vaddq_u32(pfn0_vec, mask_vec); + + const uint32x4_t cmp1 = vandq_u32(vcgeq_u32(addr_vec, pfn1_vec), vcleq_u32(addr_vec, pfn1_end_vec)); + const uint32x4_t cmp0 = vandq_u32(vcgeq_u32(addr_vec, pfn0_vec), vcleq_u32(addr_vec, pfn0_end_vec)); + + const uint32x4_t lanes_enabled = vorrq_u32(vandq_u32(cached1_vec, cmp1), vandq_u32(cached0_vec, cmp0)); + + const uint32x2_t tmp = vorr_u32(vget_low_u32(lanes_enabled), vget_high_u32(lanes_enabled)); + if (vget_lane_u32(vpmax_u32(tmp, tmp), 0)) + return true; + } #endif for (; i < size; i++) {