[Base, aarch64] Add `copy_and_swap` NEON impls

This commit is contained in:
Joel Linn 2022-01-21 22:28:47 +01:00 committed by Triang3l
parent bfaad055a2
commit 4a288dc6bd
1 changed files with 127 additions and 1 deletions

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -11,6 +11,10 @@
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#if XE_ARCH_ARM64
#include <arm_neon.h>
#endif
#include <algorithm>
DEFINE_bool(
@ -215,7 +219,128 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
dest[i] = (src[i] >> 16) | (src[i] << 16);
}
}
#elif XE_ARCH_ARM64
// Although NEON offers vector rev instructions (like vrev32q_u8), they are
// slower in benchmarks. Also, using uint8x16xN_t wasn't any faster in the
// benchmarks, hence we use just use one SIMD register to minimize residual
// processing.
void copy_and_swap_16_aligned(void* dst_ptr, const void* src_ptr,
size_t count) {
copy_and_swap_16_unaligned(dst_ptr, src_ptr, count);
}
void copy_and_swap_16_unaligned(void* dst_ptr, const void* src_ptr,
size_t count) {
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
const uint8x16_t tbl_idx =
vcombine_u8(vcreate_u8(UINT64_C(0x0607040502030001)),
vcreate_u8(UINT64_C(0x0E0F0C0D0A0B0809)));
while (count >= 8) {
uint8x16_t data = vld1q_u8(src);
data = vqtbl1q_u8(data, tbl_idx);
vst1q_u8(dst, data);
count -= 8;
// These pointer increments will be combined with the load/stores (ldr/str)
// into single instructions (at least by clang)
dst += 16;
src += 16;
}
while (count > 0) {
store_and_swap<uint16_t>(dst, load<uint16_t>(src));
count--;
dst += 2;
src += 2;
}
}
void copy_and_swap_32_aligned(void* dst, const void* src, size_t count) {
copy_and_swap_32_unaligned(dst, src, count);
}
void copy_and_swap_32_unaligned(void* dst_ptr, const void* src_ptr,
size_t count) {
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
const uint8x16_t tbl_idx =
vcombine_u8(vcreate_u8(UINT64_C(0x405060700010203)),
vcreate_u8(UINT64_C(0x0C0D0E0F08090A0B)));
while (count >= 4) {
uint8x16_t data = vld1q_u8(src);
data = vqtbl1q_u8(data, tbl_idx);
vst1q_u8(dst, data);
count -= 4;
dst += 16;
src += 16;
}
while (count > 0) {
store_and_swap<uint32_t>(dst, load<uint32_t>(src));
count--;
dst += 4;
src += 4;
}
}
void copy_and_swap_64_aligned(void* dst, const void* src, size_t count) {
copy_and_swap_64_unaligned(dst, src, count);
}
void copy_and_swap_64_unaligned(void* dst_ptr, const void* src_ptr,
size_t count) {
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
const uint8x16_t tbl_idx =
vcombine_u8(vcreate_u8(UINT64_C(0x0001020304050607)),
vcreate_u8(UINT64_C(0x08090A0B0C0D0E0F)));
while (count >= 2) {
uint8x16_t data = vld1q_u8(src);
data = vqtbl1q_u8(data, tbl_idx);
vst1q_u8(dst, data);
count -= 2;
dst += 16;
src += 16;
}
while (count > 0) {
store_and_swap<uint64_t>(dst, load<uint64_t>(src));
count--;
dst += 8;
src += 8;
}
}
void copy_and_swap_16_in_32_aligned(void* dst, const void* src, size_t count) {
return copy_and_swap_16_in_32_unaligned(dst, src, count);
}
void copy_and_swap_16_in_32_unaligned(void* dst_ptr, const void* src_ptr,
size_t count) {
auto dst = reinterpret_cast<uint64_t*>(dst_ptr);
auto src = reinterpret_cast<const uint64_t*>(src_ptr);
for (size_t i = 0; i < count; ++i) {
dst[i] = (src[i] >> 16) | (src[i] << 16);
}
}
#else
// Generic routines.
void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
return copy_and_swap_16_unaligned(dest, src, count);
@ -268,6 +393,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
dest[i] = (src[i] >> 16) | (src[i] << 16);
}
}
#endif
} // namespace xe