From 26a6e887ad3dad5a1fc913553136dae232c3325d Mon Sep 17 00:00:00 2001
From: Jaklyy <102590697+Jaklyy@users.noreply.github.com>
Date: Wed, 16 Oct 2024 22:26:56 -0400
Subject: [PATCH] aarch64 neon impl

take one
fingers crossed it compiles!
---
 src/CP15.cpp | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/src/CP15.cpp b/src/CP15.cpp
index 9e4736f2..2079645b 100644
--- a/src/CP15.cpp
+++ b/src/CP15.cpp
@@ -20,6 +20,8 @@
 #include <string.h>
 #if defined(__x86_64__)
 #include <x86intrin.h>
+#elif defined(__ARM_NEON)
+#include <arm_neon.h>
 #endif
 #include "NDS.h"
 #include "DSi.h"
@@ -365,6 +367,25 @@ u32 ARMv5::ICacheLookup(const u32 addr)
     if (!set) goto miss; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { ICacheTags[id+0], ICacheTags[id+1], ICacheTags[id+2], ICacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) goto miss;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow
@@ -535,6 +556,25 @@ u32 ARMv5::DCacheLookup(const u32 addr)
     if (!set) goto miss; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID,
+                       tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) goto miss;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow
@@ -656,6 +696,22 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val)
     if (!set) return false; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) return false;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow
@@ -708,6 +764,22 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val)
     if (!set) return false; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) return false;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow
@@ -761,6 +833,22 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val)
     if (!set) return false; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) return false;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow
@@ -813,6 +901,22 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr)
     if (!set) return; // check if none of them were a match
     else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
 
+    {
+#elif defined(__ARM_NEON)
+    uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
+    uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
+                        ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
+    uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
+    tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
+    cmp = vceqq_u32(tags, cmp);
+    uint16x4_t res = vmovn_u32(cmp); // narrow each 32 bit lane to 16 bits; res is 8 bytes wide
+    u64 set; memcpy(&set, &res, 8); // copy all 8 bytes so sets 2 & 3 are checked and set is fully initialized
+
+    if (!set) return;
+    else set = __builtin_ctzll(set) >> 4; // 64 bit ctz; each set's match occupies 16 bits
+    {
 #else
 
     // fallback for loop; slow