parent
9f2b097e96
commit
26a6e887ad
104
src/CP15.cpp
104
src/CP15.cpp
|
@ -20,6 +20,8 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
#include "NDS.h"
|
#include "NDS.h"
|
||||||
#include "DSi.h"
|
#include "DSi.h"
|
||||||
|
@ -365,6 +367,25 @@ u32 ARMv5::ICacheLookup(const u32 addr)
|
||||||
if (!set) goto miss; // check if none of them were a match
|
if (!set) goto miss; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { ICacheTags[id+0], ICacheTags[id+1], ICacheTags[id+2], ICacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) goto miss;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
@ -535,6 +556,25 @@ u32 ARMv5::DCacheLookup(const u32 addr)
|
||||||
if (!set) goto miss; // check if none of them were a match
|
if (!set) goto miss; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID,
|
||||||
|
tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) goto miss;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
@ -656,6 +696,22 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val)
|
||||||
if (!set) return false; // check if none of them were a match
|
if (!set) return false; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) return false;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
@ -708,6 +764,22 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val)
|
||||||
if (!set) return false; // check if none of them were a match
|
if (!set) return false; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) return false;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
@ -761,6 +833,22 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val)
|
||||||
if (!set) return false; // check if none of them were a match
|
if (!set) return false; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) return false;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
@ -813,6 +901,22 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr)
|
||||||
if (!set) return; // check if none of them were a match
|
if (!set) return; // check if none of them were a match
|
||||||
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match
|
||||||
|
|
||||||
|
{
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
|
uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags
|
||||||
|
uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK),
|
||||||
|
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask
|
||||||
|
uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for
|
||||||
|
tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for
|
||||||
|
cmp = vceqq_u32(tags, cmp);
|
||||||
|
uint16x4_t res = vmovn_u32(cmp);
|
||||||
|
u64 set; memcpy(&set, &res, 4);
|
||||||
|
|
||||||
|
if (!set) return;
|
||||||
|
else set = __builtin_ctz(set) >> 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
#else
|
#else
|
||||||
// fallback for loop; slow
|
// fallback for loop; slow
|
||||||
|
|
Loading…
Reference in New Issue