[Base, aarch64] Add `copy_and_swap` NEON impls
This commit is contained in:
parent
bfaad055a2
commit
4a288dc6bd
|
@ -2,7 +2,7 @@
|
||||||
******************************************************************************
|
******************************************************************************
|
||||||
* Xenia : Xbox 360 Emulator Research Project *
|
* Xenia : Xbox 360 Emulator Research Project *
|
||||||
******************************************************************************
|
******************************************************************************
|
||||||
* Copyright 2014 Ben Vanik. All rights reserved. *
|
* Copyright 2022 Ben Vanik. All rights reserved. *
|
||||||
* Released under the BSD license - see LICENSE in the root for more details. *
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||||
******************************************************************************
|
******************************************************************************
|
||||||
*/
|
*/
|
||||||
|
@ -11,6 +11,10 @@
|
||||||
#include "xenia/base/cvar.h"
|
#include "xenia/base/cvar.h"
|
||||||
#include "xenia/base/platform.h"
|
#include "xenia/base/platform.h"
|
||||||
|
|
||||||
|
#if XE_ARCH_ARM64
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
DEFINE_bool(
|
DEFINE_bool(
|
||||||
|
@ -215,7 +219,128 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
|
||||||
dest[i] = (src[i] >> 16) | (src[i] << 16);
|
dest[i] = (src[i] >> 16) | (src[i] << 16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif XE_ARCH_ARM64
|
||||||
|
|
||||||
|
// Although NEON offers vector rev instructions (like vrev32q_u8), they are
|
||||||
|
// slower in benchmarks. Also, using uint8x16xN_t wasn't any faster in the
|
||||||
|
// benchmarks, hence we use just use one SIMD register to minimize residual
|
||||||
|
// processing.
|
||||||
|
|
||||||
|
void copy_and_swap_16_aligned(void* dst_ptr, const void* src_ptr,
|
||||||
|
size_t count) {
|
||||||
|
copy_and_swap_16_unaligned(dst_ptr, src_ptr, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_16_unaligned(void* dst_ptr, const void* src_ptr,
|
||||||
|
size_t count) {
|
||||||
|
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
|
||||||
|
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
|
||||||
|
|
||||||
|
const uint8x16_t tbl_idx =
|
||||||
|
vcombine_u8(vcreate_u8(UINT64_C(0x0607040502030001)),
|
||||||
|
vcreate_u8(UINT64_C(0x0E0F0C0D0A0B0809)));
|
||||||
|
|
||||||
|
while (count >= 8) {
|
||||||
|
uint8x16_t data = vld1q_u8(src);
|
||||||
|
data = vqtbl1q_u8(data, tbl_idx);
|
||||||
|
vst1q_u8(dst, data);
|
||||||
|
|
||||||
|
count -= 8;
|
||||||
|
// These pointer increments will be combined with the load/stores (ldr/str)
|
||||||
|
// into single instructions (at least by clang)
|
||||||
|
dst += 16;
|
||||||
|
src += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (count > 0) {
|
||||||
|
store_and_swap<uint16_t>(dst, load<uint16_t>(src));
|
||||||
|
|
||||||
|
count--;
|
||||||
|
dst += 2;
|
||||||
|
src += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_32_aligned(void* dst, const void* src, size_t count) {
|
||||||
|
copy_and_swap_32_unaligned(dst, src, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_32_unaligned(void* dst_ptr, const void* src_ptr,
|
||||||
|
size_t count) {
|
||||||
|
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
|
||||||
|
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
|
||||||
|
|
||||||
|
const uint8x16_t tbl_idx =
|
||||||
|
vcombine_u8(vcreate_u8(UINT64_C(0x405060700010203)),
|
||||||
|
vcreate_u8(UINT64_C(0x0C0D0E0F08090A0B)));
|
||||||
|
|
||||||
|
while (count >= 4) {
|
||||||
|
uint8x16_t data = vld1q_u8(src);
|
||||||
|
data = vqtbl1q_u8(data, tbl_idx);
|
||||||
|
vst1q_u8(dst, data);
|
||||||
|
|
||||||
|
count -= 4;
|
||||||
|
dst += 16;
|
||||||
|
src += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (count > 0) {
|
||||||
|
store_and_swap<uint32_t>(dst, load<uint32_t>(src));
|
||||||
|
|
||||||
|
count--;
|
||||||
|
dst += 4;
|
||||||
|
src += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_64_aligned(void* dst, const void* src, size_t count) {
|
||||||
|
copy_and_swap_64_unaligned(dst, src, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_64_unaligned(void* dst_ptr, const void* src_ptr,
|
||||||
|
size_t count) {
|
||||||
|
auto dst = reinterpret_cast<uint8_t*>(dst_ptr);
|
||||||
|
auto src = reinterpret_cast<const uint8_t*>(src_ptr);
|
||||||
|
|
||||||
|
const uint8x16_t tbl_idx =
|
||||||
|
vcombine_u8(vcreate_u8(UINT64_C(0x0001020304050607)),
|
||||||
|
vcreate_u8(UINT64_C(0x08090A0B0C0D0E0F)));
|
||||||
|
|
||||||
|
while (count >= 2) {
|
||||||
|
uint8x16_t data = vld1q_u8(src);
|
||||||
|
data = vqtbl1q_u8(data, tbl_idx);
|
||||||
|
vst1q_u8(dst, data);
|
||||||
|
|
||||||
|
count -= 2;
|
||||||
|
dst += 16;
|
||||||
|
src += 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (count > 0) {
|
||||||
|
store_and_swap<uint64_t>(dst, load<uint64_t>(src));
|
||||||
|
|
||||||
|
count--;
|
||||||
|
dst += 8;
|
||||||
|
src += 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_16_in_32_aligned(void* dst, const void* src, size_t count) {
|
||||||
|
return copy_and_swap_16_in_32_unaligned(dst, src, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_and_swap_16_in_32_unaligned(void* dst_ptr, const void* src_ptr,
|
||||||
|
size_t count) {
|
||||||
|
auto dst = reinterpret_cast<uint64_t*>(dst_ptr);
|
||||||
|
auto src = reinterpret_cast<const uint64_t*>(src_ptr);
|
||||||
|
for (size_t i = 0; i < count; ++i) {
|
||||||
|
dst[i] = (src[i] >> 16) | (src[i] << 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
// Generic routines.
|
// Generic routines.
|
||||||
void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
|
void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
|
||||||
return copy_and_swap_16_unaligned(dest, src, count);
|
return copy_and_swap_16_unaligned(dest, src, count);
|
||||||
|
@ -268,6 +393,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
|
||||||
dest[i] = (src[i] >> 16) | (src[i] << 16);
|
dest[i] = (src[i] >> 16) | (src[i] << 16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
Loading…
Reference in New Issue