aarch64 neon impl take one

fingers crossed it compiles!
This commit is contained in:
Jaklyy 2024-10-16 22:26:56 -04:00
parent 9f2b097e96
commit 26a6e887ad
1 changed files with 104 additions and 0 deletions

View File

@ -20,6 +20,8 @@
#include <string.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#endif
#include "NDS.h"
#include "DSi.h"
@ -365,6 +367,25 @@ u32 ARMv5::ICacheLookup(const u32 addr)
if (!set) goto miss; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { ICacheTags[id+0], ICacheTags[id+1], ICacheTags[id+2], ICacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) goto miss;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow
@ -535,6 +556,25 @@ u32 ARMv5::DCacheLookup(const u32 addr)
if (!set) goto miss; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID,
tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) goto miss;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow
@ -656,6 +696,22 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val)
if (!set) return false; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) return false;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow
@ -708,6 +764,22 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val)
if (!set) return false; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) return false;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow
@ -761,6 +833,22 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val)
if (!set) return false; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) return false;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow
@ -813,6 +901,22 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr)
if (!set) return; // check if none of them were a match
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
{
#elif defined(__ARM_NEON)
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
cmp = vceqq_u32(tags, cmp);
uint16x4_t res = vmovn_u32(cmp);
u64 set; memcpy(&set, &res, 4);
if (!set) return;
else set = __builtin_ctz(set) >> 3;
{
#else
// fallback for loop; slow