diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 8acbf43bd..c45bf4636 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -11,6 +11,10 @@ #include "xenia/base/cvar.h" #include "xenia/base/platform.h" +#if XE_ARCH_ARM64 +#include +#endif + #include DEFINE_bool( @@ -215,7 +219,128 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr, dest[i] = (src[i] >> 16) | (src[i] << 16); } } + +#elif XE_ARCH_ARM64 + +// Although NEON offers vector rev instructions (like vrev32q_u8), they are +// slower in benchmarks. Also, using uint8x16xN_t wasn't any faster in the +// benchmarks, hence we use just use one SIMD register to minimize residual +// processing. + +void copy_and_swap_16_aligned(void* dst_ptr, const void* src_ptr, + size_t count) { + copy_and_swap_16_unaligned(dst_ptr, src_ptr, count); +} + +void copy_and_swap_16_unaligned(void* dst_ptr, const void* src_ptr, + size_t count) { + auto dst = reinterpret_cast(dst_ptr); + auto src = reinterpret_cast(src_ptr); + + const uint8x16_t tbl_idx = + vcombine_u8(vcreate_u8(UINT64_C(0x0607040502030001)), + vcreate_u8(UINT64_C(0x0E0F0C0D0A0B0809))); + + while (count >= 8) { + uint8x16_t data = vld1q_u8(src); + data = vqtbl1q_u8(data, tbl_idx); + vst1q_u8(dst, data); + + count -= 8; + // These pointer increments will be combined with the load/stores (ldr/str) + // into single instructions (at least by clang) + dst += 16; + src += 16; + } + + while (count > 0) { + store_and_swap(dst, load(src)); + + count--; + dst += 2; + src += 2; + } +} + +void copy_and_swap_32_aligned(void* dst, const void* src, size_t count) { + copy_and_swap_32_unaligned(dst, src, count); +} + +void copy_and_swap_32_unaligned(void* dst_ptr, const void* src_ptr, + size_t count) { + auto dst = reinterpret_cast(dst_ptr); + auto src = reinterpret_cast(src_ptr); + + const uint8x16_t tbl_idx = + vcombine_u8(vcreate_u8(UINT64_C(0x405060700010203)), + vcreate_u8(UINT64_C(0x0C0D0E0F08090A0B))); + + while (count >= 4) { + uint8x16_t data = vld1q_u8(src); + data = vqtbl1q_u8(data, tbl_idx); + vst1q_u8(dst, data); + + count -= 4; + dst += 16; + src += 16; + } + + while (count > 0) { + store_and_swap(dst, load(src)); + + count--; + dst += 4; + src += 4; + } +} + +void copy_and_swap_64_aligned(void* dst, const void* src, size_t count) { + copy_and_swap_64_unaligned(dst, src, count); +} + +void copy_and_swap_64_unaligned(void* dst_ptr, const void* src_ptr, + size_t count) { + auto dst = reinterpret_cast(dst_ptr); + auto src = reinterpret_cast(src_ptr); + + const uint8x16_t tbl_idx = + vcombine_u8(vcreate_u8(UINT64_C(0x0001020304050607)), + vcreate_u8(UINT64_C(0x08090A0B0C0D0E0F))); + + while (count >= 2) { + uint8x16_t data = vld1q_u8(src); + data = vqtbl1q_u8(data, tbl_idx); + vst1q_u8(dst, data); + + count -= 2; + dst += 16; + src += 16; + } + + while (count > 0) { + store_and_swap(dst, load(src)); + + count--; + dst += 8; + src += 8; + } +} + +void copy_and_swap_16_in_32_aligned(void* dst, const void* src, size_t count) { + return copy_and_swap_16_in_32_unaligned(dst, src, count); +} + +void copy_and_swap_16_in_32_unaligned(void* dst_ptr, const void* src_ptr, + size_t count) { + auto dst = reinterpret_cast(dst_ptr); + auto src = reinterpret_cast(src_ptr); + for (size_t i = 0; i < count; ++i) { + dst[i] = (src[i] >> 16) | (src[i] << 16); + } +} + #else + // Generic routines. void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) { return copy_and_swap_16_unaligned(dest, src, count); @@ -268,6 +393,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr, dest[i] = (src[i] >> 16) | (src[i] << 16); } } + #endif } // namespace xe