BufferUtils: cleanup regarding ARM64

This commit is contained in:
Nekotekina 2022-09-04 22:10:04 +03:00 committed by Ivan
parent 82258915da
commit 5985f0eefa
2 changed files with 8 additions and 59 deletions

View File

@ -28,10 +28,6 @@
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
#if defined(ARCH_ARM64)
#include "Emu/CPU/sse2neon.h"
#endif
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag;
@ -7457,12 +7453,13 @@ public:
set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b));
}
#if defined(ARCH_X64) || defined(ARCH_ARM64)
#if defined(ARCH_X64)
static __m128i exec_rotqby(__m128i a, u8 b)
{
alignas(32) const __m128i buf[2]{a, a};
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const u8*>(buf) + (16 - (b & 0xf))));
}
#elif defined(ARCH_ARM64)
#else
#error "Unimplemented"
#endif
@ -7472,6 +7469,7 @@ public:
const auto a = get_vr<u8[16]>(op.ra);
const auto b = get_vr<u8[16]>(op.rb);
#if defined(ARCH_X64)
if (!m_use_ssse3)
{
value_t<u8[16]> r;
@ -7479,6 +7477,7 @@ public:
set_vr(op.rt, r);
return;
}
#endif
// Data with swapped endian from a load instruction
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)

View File

@ -10,53 +10,21 @@
#include "util/v128.hpp"
#include "util/simd.hpp"
#if defined(ARCH_X64)
#include "emmintrin.h"
#include "immintrin.h"
#endif
#if !defined(_MSC_VER)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#ifdef ARCH_ARM64
#if !defined(_MSC_VER)
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
#undef FORCE_INLINE
#include "Emu/CPU/sse2neon.h"
#endif
#if defined(_MSC_VER) || !defined(__SSE2__)
#define PLAIN_FUNC
#define SSE4_1_FUNC
#define AVX2_FUNC
#define AVX3_FUNC
#else
#ifndef __clang__
#define PLAIN_FUNC __attribute__((optimize("no-tree-vectorize")))
#else
#define PLAIN_FUNC
#endif
#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
#define AVX2_FUNC __attribute__((__target__("avx2")))
#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl")))
#ifndef __AVX2__
using __m256i = long long __attribute__((vector_size(32)));
#endif
#endif // _MSC_VER
SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
{
return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
}
SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
{
return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1))));
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
constexpr bool s_use_sse4_1 = true;
@ -89,17 +57,8 @@ constexpr bool s_use_avx2 = false;
constexpr bool s_use_avx3 = false;
#endif
const v128 s_bswap_u32_mask = _mm_set_epi8(
0xC, 0xD, 0xE, 0xF,
0x8, 0x9, 0xA, 0xB,
0x4, 0x5, 0x6, 0x7,
0x0, 0x1, 0x2, 0x3);
const v128 s_bswap_u16_mask = _mm_set_epi8(
0xE, 0xF, 0xC, 0xD,
0xA, 0xB, 0x8, 0x9,
0x6, 0x7, 0x4, 0x5,
0x2, 0x3, 0x0, 0x1);
const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
const v128 s_bswap_u16_mask = v128::from32(0x02030001, 0x06070405, 0x0a0b0809, 0x0e0f0c0d);
namespace utils
{
@ -113,13 +72,10 @@ namespace utils
namespace
{
template <bool Compare>
PLAIN_FUNC auto copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count)
auto copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count)
{
u32 result = 0;
#ifdef __clang__
#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
#endif
for (u32 i = 0; i < count; i++)
{
const u32 data = stx::se_storage<u32>::swap(src[i]);
@ -213,16 +169,10 @@ namespace
c.vec_cleanup_ret();
}
#elif defined(ARCH_ARM64)
template <bool Compare>
void build_copy_data_swap_u32(native_asm& c, native_args& args)
{
c.b(&copy_data_swap_u32_naive<Compare>);
}
#endif
}
#if !defined(__APPLE__) || defined(ARCH_X64)
#if defined(ARCH_X64)
DECLARE(copy_data_swap_u32) = build_function_asm<void(*)(u32*, const u32*, u32), asmjit::simd_builder>("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
DECLARE(copy_data_swap_u32_cmp) = build_function_asm<bool(*)(u32*, const u32*, u32), asmjit::simd_builder>("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
#else