[Base, aarch64] Add `copy_and_swap` NEON impls
commit 4a288dc6bd
parent bfaad055a2
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project *
 ******************************************************************************
-* Copyright 2014 Ben Vanik. All rights reserved. *
+* Copyright 2022 Ben Vanik. All rights reserved. *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -11,6 +11,10 @@
 #include "xenia/base/cvar.h"
 #include "xenia/base/platform.h"
+
+#if XE_ARCH_ARM64
+#include <arm_neon.h>
+#endif
 
 #include <algorithm>
 
 DEFINE_bool(
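Aside: the heavy lifting in the new implementations below is done by the vqtbl1q_u8 table-lookup intrinsic, which this hunk's <arm_neon.h> include makes available. A standalone sketch of the same idiom (illustrative only, not part of this commit; assumes an aarch64 target and a compiler providing __builtin_bswap32), byte-swapping four uint32_t values:

#include <arm_neon.h>

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t input[4] = {0x11223344u, 0xAABBCCDDu, 0x00000001u,
                             0xFFFFFFFEu};
  // Output lane i receives source byte tbl_idx[i].
  const uint8x16_t tbl_idx =
      vcombine_u8(vcreate_u8(UINT64_C(0x0405060700010203)),
                  vcreate_u8(UINT64_C(0x0C0D0E0F08090A0B)));
  uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
  data = vqtbl1q_u8(data, tbl_idx);
  uint32_t output[4];
  vst1q_u8(reinterpret_cast<uint8_t*>(output), data);
  for (int i = 0; i < 4; ++i) {
    assert(output[i] == __builtin_bswap32(input[i]));  // 0x11223344 -> 0x44332211
  }
  return 0;
}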
@@ -215,7 +219,128 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
     dest[i] = (src[i] >> 16) | (src[i] << 16);
   }
 }
 
+#elif XE_ARCH_ARM64
+
+// Although NEON offers vector reverse instructions (like vrev32q_u8), they
+// were slower in benchmarks. Using the multi-register uint8x16xN_t types was
+// no faster either, so we just use one SIMD register per iteration to
+// minimize residual processing.
+
+void copy_and_swap_16_aligned(void* dst_ptr, const void* src_ptr,
+                              size_t count) {
+  copy_and_swap_16_unaligned(dst_ptr, src_ptr, count);
+}
+
+void copy_and_swap_16_unaligned(void* dst_ptr, const void* src_ptr,
+                                size_t count) {
+  auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
+  auto src = reinterpret_cast<const uint8_t*>(src_ptr);
+
+  // Table for vqtbl1q_u8: swap the two bytes of each 16-bit element.
+  const uint8x16_t tbl_idx =
+      vcombine_u8(vcreate_u8(UINT64_C(0x0607040502030001)),
+                  vcreate_u8(UINT64_C(0x0E0F0C0D0A0B0809)));
+
+  // Swap eight 16-bit elements (16 bytes) per iteration.
+  while (count >= 8) {
+    uint8x16_t data = vld1q_u8(src);
+    data = vqtbl1q_u8(data, tbl_idx);
+    vst1q_u8(dst, data);
+
+    count -= 8;
+    // These pointer increments are combined with the loads/stores (ldr/str)
+    // into single post-indexed instructions (at least by clang).
+    dst += 16;
+    src += 16;
+  }
+
+  // Scalar tail for the remaining 0-7 elements.
+  while (count > 0) {
+    store_and_swap<uint16_t>(dst, load<uint16_t>(src));
+
+    count--;
+    dst += 2;
+    src += 2;
+  }
+}
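A note on the tbl_idx constants: vcreate_u8 places the least significant byte of its 64-bit argument in lane 0, so the hex literals read back to front. Worked out for the 16-bit table above:

vcreate_u8(0x0607040502030001) -> lanes 0..7  = {01, 00, 03, 02, 05, 04, 07, 06}
vcreate_u8(0x0E0F0C0D0A0B0809) -> lanes 8..15 = {09, 08, 0B, 0A, 0D, 0C, 0F, 0E}

Since vqtbl1q_u8 writes source byte tbl_idx[i] to output lane i, each pair of adjacent bytes is exchanged: a byte swap of every 16-bit element.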
+
+void copy_and_swap_32_aligned(void* dst, const void* src, size_t count) {
+  copy_and_swap_32_unaligned(dst, src, count);
+}
+
+void copy_and_swap_32_unaligned(void* dst_ptr, const void* src_ptr,
+                                size_t count) {
+  auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
+  auto src = reinterpret_cast<const uint8_t*>(src_ptr);
+
+  // Table for vqtbl1q_u8: reverse the four bytes of each 32-bit element.
+  const uint8x16_t tbl_idx =
+      vcombine_u8(vcreate_u8(UINT64_C(0x0405060700010203)),
+                  vcreate_u8(UINT64_C(0x0C0D0E0F08090A0B)));
+
+  // Swap four 32-bit elements (16 bytes) per iteration.
+  while (count >= 4) {
+    uint8x16_t data = vld1q_u8(src);
+    data = vqtbl1q_u8(data, tbl_idx);
+    vst1q_u8(dst, data);
+
+    count -= 4;
+    dst += 16;
+    src += 16;
+  }
+
+  // Scalar tail for the remaining 0-3 elements.
+  while (count > 0) {
+    store_and_swap<uint32_t>(dst, load<uint32_t>(src));
+
+    count--;
+    dst += 4;
+    src += 4;
+  }
+}
+
+void copy_and_swap_64_aligned(void* dst, const void* src, size_t count) {
+  copy_and_swap_64_unaligned(dst, src, count);
+}
+
+void copy_and_swap_64_unaligned(void* dst_ptr, const void* src_ptr,
+                                size_t count) {
+  auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
+  auto src = reinterpret_cast<const uint8_t*>(src_ptr);
+
+  // Table for vqtbl1q_u8: reverse the eight bytes of each 64-bit element.
+  const uint8x16_t tbl_idx =
+      vcombine_u8(vcreate_u8(UINT64_C(0x0001020304050607)),
+                  vcreate_u8(UINT64_C(0x08090A0B0C0D0E0F)));
+
+  // Swap two 64-bit elements (16 bytes) per iteration.
+  while (count >= 2) {
+    uint8x16_t data = vld1q_u8(src);
+    data = vqtbl1q_u8(data, tbl_idx);
+    vst1q_u8(dst, data);
+
+    count -= 2;
+    dst += 16;
+    src += 16;
+  }
+
+  // Scalar tail for the remaining element, if any.
+  while (count > 0) {
+    store_and_swap<uint64_t>(dst, load<uint64_t>(src));
+
+    count--;
+    dst += 8;
+    src += 8;
+  }
+}
+
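The 64-bit table follows the same lane convention: vcreate_u8(UINT64_C(0x0001020304050607)) yields lanes 0..7 = {07, 06, 05, 04, 03, 02, 01, 00} and the second literal yields lanes 8..15 = {0F, 0E, 0D, 0C, 0B, 0A, 09, 08}, i.e. a full byte reversal of each 64-bit element.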
+void copy_and_swap_16_in_32_aligned(void* dst, const void* src, size_t count) {
+  return copy_and_swap_16_in_32_unaligned(dst, src, count);
+}
+
+void copy_and_swap_16_in_32_unaligned(void* dst_ptr, const void* src_ptr,
+                                      size_t count) {
+  auto dst = reinterpret_cast<uint64_t*>(dst_ptr);
+  auto src = reinterpret_cast<const uint64_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    // Swap the 16-bit halves of each 32-bit word, masking the shifted
+    // values so the two words of each 64-bit element don't bleed into
+    // each other.
+    dst[i] = ((src[i] >> 16) & UINT64_C(0x0000FFFF0000FFFF)) |
+             ((src[i] << 16) & UINT64_C(0xFFFF0000FFFF0000));
+  }
+}
+
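The masking in the loop above matters: on a 64-bit element, a plain (x >> 16) | (x << 16) lets the two 32-bit words bleed into each other (the middle halfwords get OR-ed together). A quick standalone check (illustrative sketch, not commit code):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t x = UINT64_C(0xAAAABBBBCCCCDDDD);  // words AAAABBBB, CCCCDDDD
  const uint64_t masked = ((x >> 16) & UINT64_C(0x0000FFFF0000FFFF)) |
                          ((x << 16) & UINT64_C(0xFFFF0000FFFF0000));
  assert(masked == UINT64_C(0xBBBBAAAADDDDCCCC));  // halves swapped per word
  const uint64_t rotated = (x >> 16) | (x << 16);
  assert(rotated == UINT64_C(0xBBBBEEEEFFFFCCCC));  // AAAA|CCCC, BBBB|DDDD
  return 0;
}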
 #else
 
 // Generic routines.
 void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_16_unaligned(dest, src, count);
@@ -268,6 +393,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
     dest[i] = (src[i] >> 16) | (src[i] << 16);
   }
 }
+
 #endif
 
 } // namespace xe
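A minimal sanity check for the new paths (a sketch under assumptions: built for an aarch64 target, linked against xenia/base, with the declarations in xenia/base/memory.h; the main() harness is illustrative):

#include <cstdint>
#include <cstdio>

#include "xenia/base/memory.h"

int main() {
  uint32_t src[9];
  uint32_t dst[9] = {0};
  for (uint32_t i = 0; i < 9; ++i) {
    src[i] = 0x01020304u * (i + 1);
  }
  // An odd count (9) exercises both the NEON loop (two iterations of four
  // elements) and the scalar tail (one element).
  xe::copy_and_swap_32_unaligned(dst, src, 9);
  for (uint32_t i = 0; i < 9; ++i) {
    if (dst[i] != __builtin_bswap32(src[i])) {
      std::printf("mismatch at %u\n", i);
      return 1;
    }
  }
  std::printf("ok\n");
  return 0;
}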