Misc: Fix ARM32 build (again)

This commit is contained in:
Stenzek 2024-08-14 17:00:43 +10:00
parent 460acce561
commit 2e2451998c
No known key found for this signature in database
9 changed files with 558 additions and 85 deletions

View File

@ -83,8 +83,8 @@ function(detect_architecture)
AND CMAKE_SIZEOF_VOID_P EQUAL 4))
message(STATUS "Building ARM32 binaries.")
set(CPU_ARCH_ARM32 TRUE PARENT_SCOPE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64")
message(STATUS "Building RISC-V 64 binaries.")
set(CPU_ARCH_RISCV64 TRUE PARENT_SCOPE)

View File

@ -108,7 +108,7 @@ TEST(GSVector, YUVToRGB)
#if 0
// Performance test
u32 g_gsvector_yuvtorgb_temp[64];
alignas(VECTOR_ALIGNMENT) u32 g_gsvector_yuvtorgb_temp[64];
TEST(GSVector, YUVToRGB_Scalar)
{

View File

@ -92,8 +92,8 @@ ALWAYS_INLINE static void* AlignedMalloc(size_t size, size_t alignment)
#else
// Unaligned sizes are slow on macOS.
#ifdef __APPLE__
if (IsPow2(alignment))
size = (size + alignment - 1) & ~(alignment - 1);
if (IsPow2(alignment))
size = (size + alignment - 1) & ~(alignment - 1);
#endif
void* ret = nullptr;
return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;

View File

@ -5,6 +5,7 @@
#include "common/types.h"
#include <algorithm>
#include <cmath>
#define GSVECTOR_HAS_UNSIGNED 1
#define GSVECTOR_HAS_SRLV 1
@ -86,7 +87,7 @@ public:
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
@ -174,6 +175,8 @@ public:
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
@ -190,6 +193,56 @@ public:
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(v2s); }
#else
ALWAYS_INLINE u8 minv_u8() const
{
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
return static_cast<u8>(
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
}
ALWAYS_INLINE u16 maxv_u8() const
{
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
return static_cast<u8>(
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
}
ALWAYS_INLINE u16 minv_u16() const
{
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
return static_cast<u16>(
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
}
ALWAYS_INLINE u16 maxv_u16() const
{
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
return static_cast<u16>(
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
}
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
ALWAYS_INLINE u32 minv_u32() const
{
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
}
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
ALWAYS_INLINE u32 maxv_u32() const
{
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
}
#endif
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
@ -249,6 +302,8 @@ public:
return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
{
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
@ -272,6 +327,33 @@ public:
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
#else
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
{
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
}
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
{
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
}
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
ALWAYS_INLINE GSVector2i upl8() const
{
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
}
ALWAYS_INLINE GSVector2i upl16() const
{
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
}
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
#endif
ALWAYS_INLINE GSVector2i i8to16() const
{
return GSVector2i(vreinterpret_s32_s16(vget_low_s8(vmovl_s8(vreinterpret_s8_s32(v2s)))));
@ -465,7 +547,7 @@ public:
ALWAYS_INLINE bool eq(const GSVector2i& v) const
{
return (vmaxv_u32(vreinterpret_u32_s32(veor_s32(v2s, v.v2s))) == 0);
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
}
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
@ -483,11 +565,6 @@ public:
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
}
ALWAYS_INLINE GSVector2i eq64(const GSVector2i& v) const
{
return GSVector2i(vreinterpret_s32_u64(vceq_s64(vreinterpret_s64_s32(v2s), vreinterpret_s64_s32(v.v2s))));
}
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
@ -553,13 +630,23 @@ public:
ALWAYS_INLINE bool alltrue() const
{
// MSB should be set in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
#else
return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
0x80808080u);
#endif
}
ALWAYS_INLINE bool allfalse() const
{
// MSB should be clear in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
return (vmaxv_u32(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
#else
return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
0);
#endif
}
template<int i>
@ -744,10 +831,26 @@ public:
return GSVector2(recip);
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
#else
ALWAYS_INLINE GSVector2 floor() const
{
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
}
ALWAYS_INLINE GSVector2 ceil() const
{
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
}
#endif
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
@ -791,7 +894,11 @@ public:
template<int src, int dst>
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
#else
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
#endif
}
template<int i>
@ -800,7 +907,15 @@ public:
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
}
ALWAYS_INLINE float dot(const GSVector2& v) const { return vaddv_f32(vmul_f32(v2s, v.v2s)); }
ALWAYS_INLINE float dot(const GSVector2& v) const
{
#ifdef CPU_ARCH_ARM64
return vaddv_f32(vmul_f32(v2s, v.v2s));
#else
const float32x2_t dp = vmul_f32(v2s, v.v2s);
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
#endif
}
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
@ -817,7 +932,14 @@ public:
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
ALWAYS_INLINE void operator/=(const GSVector2& v) { v2s = vdiv_f32(v2s, v.v2s); }
ALWAYS_INLINE void operator/=(const GSVector2& v)
{
#ifdef CPU_ARCH_ARM64
v2s = vdiv_f32(v2s, v.v2s);
#else
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
#endif
}
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
@ -856,7 +978,12 @@ public:
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
{
#ifdef CPU_ARCH_ARM64
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
#else
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
#endif
}
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
@ -1013,8 +1140,8 @@ public:
ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {}
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
@ -1035,7 +1162,14 @@ public:
ALWAYS_INLINE s32 rarea() const { return width() * height(); }
ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); }
ALWAYS_INLINE bool rempty() const
{
#ifdef CPU_ARCH_ARM64
return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0);
#else
return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) == 0);
#endif
}
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); }
@ -1159,13 +1293,32 @@ public:
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
{
int32x4_t acc =
#ifdef CPU_ARCH_ARM64
const int32x4_t acc =
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
return GSVector4i(acc);
return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
#else
// borrowed from sse2neon
const int32x4_t low =
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
const int32x4_t high =
vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
#endif
}
ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); }
ALWAYS_INLINE GSVector4i addp_s32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4i(vpaddq_s32(v4s, v4s));
#else
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
return GSVector4i(vcombine_s32(res, res));
#endif
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
@ -1183,6 +1336,70 @@ public:
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); }
#else
ALWAYS_INLINE u8 minv_u8() const
{
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
return static_cast<u8>(
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
}
ALWAYS_INLINE u16 maxv_u8() const
{
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
return static_cast<u8>(
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
}
ALWAYS_INLINE u16 minv_u16() const
{
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
return static_cast<u16>(
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
}
ALWAYS_INLINE u16 maxv_u16() const
{
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
return static_cast<u16>(
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
}
ALWAYS_INLINE s32 minv_s32() const
{
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
}
ALWAYS_INLINE u32 minv_u32() const
{
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(vmin), 0), vget_lane_u32(vreinterpret_u32_s32(vmin), 1));
}
ALWAYS_INLINE s32 maxv_s32() const
{
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
}
ALWAYS_INLINE u32 maxv_u32() const
{
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(vmax), 0), vget_lane_u32(vreinterpret_u32_s32(vmax), 1));
}
#endif
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
@ -1224,7 +1441,13 @@ public:
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
#else
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
#endif
}
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
@ -1271,6 +1494,8 @@ public:
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
{
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
@ -1341,6 +1566,81 @@ public:
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
}
#else
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
{
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
}
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
{
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
}
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
{
const int16x4x2_t res =
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
}
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
{
const int16x4x2_t res =
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
}
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
{
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
}
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
{
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
}
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
{
return GSVector4i(vreinterpretq_s32_s64(
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
}
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
{
return GSVector4i(vreinterpretq_s32_s64(
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
}
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
ALWAYS_INLINE GSVector4i upl64() const
{
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
}
ALWAYS_INLINE GSVector4i uph64() const
{
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
}
#endif
ALWAYS_INLINE GSVector4i i8to16() const
{
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
@ -1537,11 +1837,14 @@ public:
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i))));
}
#ifdef CPU_ARCH_ARM64
// not on arm32, hopefully we can do without
ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
{
return GSVector4i(
vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
}
#endif
template<int i>
ALWAYS_INLINE GSVector4i srl64() const
@ -1554,11 +1857,13 @@ public:
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i))));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
{
return GSVector4i(
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
}
#endif
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
{
@ -1588,7 +1893,14 @@ public:
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
const int16x8_t a = vreinterpretq_s16_s32(v4s);
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
#ifdef CPU_ARCH_ARM64
return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
// sse2neon again
int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
#endif
}
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
@ -1719,7 +2031,13 @@ public:
ALWAYS_INLINE bool eq(const GSVector4i& v) const
{
return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0);
const int32x4_t res = veorq_s32(v4s, v.v4s);
#ifdef CPU_ARCH_ARM64
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
#else
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
#endif
}
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
@ -1737,10 +2055,12 @@ public:
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
{
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
}
#endif
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
@ -1807,13 +2127,23 @@ public:
ALWAYS_INLINE bool alltrue() const
{
// MSB should be set in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80;
#else
const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u);
#endif
}
ALWAYS_INLINE bool allfalse() const
{
// MSB should be clear in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
#else
const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0);
#endif
}
template<int i>
@ -2099,10 +2429,7 @@ public:
v4s = vld1q_f32(arr);
}
ALWAYS_INLINE GSVector4(float x, float y)
{
v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
}
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
{
@ -2112,7 +2439,7 @@ public:
ALWAYS_INLINE GSVector4(int x, int y)
{
v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 0), 0));
}
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
@ -2129,10 +2456,12 @@ public:
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE static GSVector4 f64(double x, double y)
{
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
}
#endif
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
@ -2140,19 +2469,6 @@ public:
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
/// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks
/// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove
/// possibly-denormal garbage data from vectors before computing with them
ALWAYS_INLINE GSVector4 noopt()
{
// Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the
// future the implementation should be updated
#ifdef __clang__
// __asm__("":"+x"(m)::);
#endif
return *this;
}
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@ -2172,10 +2488,28 @@ public:
return GSVector4(recip);
}
#ifdef _M_ARM64
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
#else
ALWAYS_INLINE GSVector4 floor() const
{
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
}
ALWAYS_INLINE GSVector4 ceil() const
{
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
}
#endif
ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
{
return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
@ -2197,6 +2531,8 @@ public:
return a.nmadd(b, *this); // *this - a * b
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
@ -2208,12 +2544,46 @@ public:
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
}
#else
ALWAYS_INLINE GSVector4 hadd() const
{
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
return GSVector4(vcombine_f32(res, res));
}
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
{
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
return GSVector4(vcombine_f32(res1, res2));
}
ALWAYS_INLINE GSVector4 hsub() const
{
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
return GSVector4(vsubq_f32(res.val[0], res.val[0]));
}
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
{
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
}
#endif
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
{
#ifdef CPU_ARCH_ARM64
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
#else
const GSVector4 minv(a.xyxy());
const GSVector4 maxv(a.zwzw());
#endif
return sat(minv, maxv);
}
@ -2239,6 +2609,8 @@ public:
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
}
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
@ -2253,6 +2625,34 @@ public:
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
}
#else
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
{
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
}
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
{
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
}
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
{
return GSVector4(vreinterpretq_f32_s64(
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
}
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
{
return GSVector4(vreinterpretq_f32_s64(
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
}
#endif
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
{
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
@ -2270,8 +2670,15 @@ public:
ALWAYS_INLINE int mask() const
{
#ifdef CPU_ARCH_ARM64
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
#else
// sse2neon again
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
ALWAYS_INLINE bool alltrue() const
@ -2290,7 +2697,11 @@ public:
template<int src, int dst>
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
#else
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
#endif
}
template<int i>
@ -2320,12 +2731,20 @@ public:
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
#endif
}
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
#endif
}
template<bool aligned>
@ -2341,12 +2760,29 @@ public:
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); }
ALWAYS_INLINE void operator/=(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
v4s = vdivq_f32(v4s, v.v4s);
#else
*this =
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
#endif
}
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
ALWAYS_INLINE void operator/=(float f)
{
#ifdef CPU_ARCH_ARM64
*this /= GSVector4(f);
#else
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
vgetq_lane_f32(v4s, 3) / f);
#endif
}
ALWAYS_INLINE void operator&=(const GSVector4& v)
{
@ -2380,13 +2816,27 @@ public:
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
#else
return GSVector4(
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
#endif
}
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
{
#ifdef CPU_ARCH_ARM64
return v / GSVector4(f);
#else
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
vgetq_lane_f32(v.v4s, 3) / f);
#endif
}
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
{
@ -2434,6 +2884,9 @@ public:
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
}
#ifdef CPU_ARCH_ARM64
// Not in ARM32
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
{
return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
@ -2459,14 +2912,15 @@ public:
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
}
ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
ALWAYS_INLINE GSVector4i f64toi32() const
{
const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s));
const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}
#endif
// clang-format off
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@ -2498,21 +2952,39 @@ public:
// clang-format on
ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); }
ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}
ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); }
ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}
ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }
ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f)));
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
};
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = truncate ? vcvt_s32_f32(v.v2s) : vcvtn_u32_f32(v.v2s);
v2s = vcvt_s32_f32(v.v2s);
}
ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@ -2530,9 +3002,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
return GSVector2(vreinterpret_f32_s32(v.v2s));
}
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
v4s = vcvtq_s32_f32(v.v4s);
}
ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)

View File

@ -121,7 +121,7 @@ public:
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
@ -955,7 +955,7 @@ public:
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
@ -1879,8 +1879,6 @@ public:
ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
ALWAYS_INLINE GSVector4 noopt() { return *this; }
u32 rgba32() const { return GSVector4i(*this).rgba32(); }
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@ -2316,7 +2314,7 @@ public:
return ret;
}
ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
ALWAYS_INLINE GSVector4i f64toi32() const
{
return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
}
@ -2372,9 +2370,8 @@ public:
}
};
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
// TODO: Truncation vs rounding...
x = static_cast<s32>(v.x);
y = static_cast<s32>(v.y);
}
@ -2399,9 +2396,8 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
return ret;
}
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
// TODO: Truncation vs rounding...
x = static_cast<s32>(v.x);
y = static_cast<s32>(v.y);
z = static_cast<s32>(v.z);

View File

@ -88,7 +88,7 @@ public:
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
@ -840,9 +840,9 @@ public:
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
@ -1952,9 +1952,9 @@ public:
return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
}
ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
ALWAYS_INLINE GSVector4i f64toi32() const
{
return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m)));
}
// clang-format off
@ -2007,9 +2007,9 @@ public:
}
};
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
m = _mm_cvttps_epi32(v);
}
ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@ -2027,9 +2027,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
return GSVector2(_mm_castsi128_ps(v.m));
}
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
m = _mm_cvttps_epi32(v);
}
ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)

View File

@ -66,8 +66,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
#if defined(CPU_ARCH_SSE)
const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value));
#elif defined(CPU_ARCH_NEON)
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value));
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value));
#endif
// Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
@ -78,8 +80,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
{
#if defined(CPU_ARCH_SSE)
_mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue);
#elif defined(CPU_ARCH_NEON)
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
vst1q_u64(reinterpret_cast<u64*>(dest), svalue);
#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
vst1q_u32(reinterpret_cast<u32*>(dest), svalue);
#endif
dest += PTRS_PER_VECTOR;
}

View File

@ -14,6 +14,7 @@
#include "util/page_fault_handler.h"
#include "common/align.h"
#include "common/assert.h"
#include "common/error.h"
#include "common/intrin.h"
@ -456,15 +457,15 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio
s_blocks.erase(it);
block->~Block();
std::free(block);
Common::AlignedFree(block);
block = nullptr;
}
}
if (!block)
{
block =
static_cast<Block*>(std::malloc(sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size)));
block = static_cast<Block*>(Common::AlignedMalloc(
sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size), alignof(Block)));
Assert(block);
new (block) Block();
s_blocks.push_back(block);
@ -734,7 +735,7 @@ void CPU::CodeCache::ClearBlocks()
for (Block* block : s_blocks)
{
block->~Block();
std::free(block);
Common::AlignedFree(block);
}
s_blocks.clear();

View File

@ -148,7 +148,7 @@ void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vix
}
void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
const void* addr, const vixl::aarch64::Register& tempreg)
const void* addr, const vixl::aarch32::Register& tempreg)
{
armMoveAddressToReg(armAsm, tempreg, addr);
armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
@ -1931,12 +1931,12 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
{
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
{
armEmitFarLoad(m_emit, RARG2, GetFetchMemoryAccessTimePtr());
m_emit->ldr(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
m_emit->Mov(RARG3, m_block->size);
m_emit->mul(RARG2, RARG2, RARG3);
m_emit->add(RARG1, RARG1, RARG2);
m_emit->str(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
armEmitFarLoad(m_emit, GetHostReg32(RARG2), GetFetchMemoryAccessTimePtr());
m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
m_emit->Mov(GetHostReg32(RARG3), m_block->size);
m_emit->mul(GetHostReg32(RARG2), GetHostReg32(RARG2), GetHostReg32(RARG3));
m_emit->add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(RARG2));
m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
}
else
{