Misc: Fix ARM32 build (again)

2024-08-14 17:00:43 +10:00 · 2024-08-14 17:00:43 +10:00 · 2e2451998c
parent 460acce561
commit 2e2451998c
9 changed files with 558 additions and 85 deletions
--- a/CMakeModules/DuckStationUtils.cmake
+++ b/CMakeModules/DuckStationUtils.cmake
@ -83,8 +83,8 @@ function(detect_architecture)
          AND CMAKE_SIZEOF_VOID_P EQUAL 4))
    message(STATUS "Building ARM32 binaries.")
    set(CPU_ARCH_ARM32 TRUE PARENT_SCOPE)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
  elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64")
    message(STATUS "Building RISC-V 64 binaries.")
    set(CPU_ARCH_RISCV64 TRUE PARENT_SCOPE)
--- a/src/common-tests/gsvector_yuvtorgb_test.cpp
+++ b/src/common-tests/gsvector_yuvtorgb_test.cpp
@ -108,7 +108,7 @@ TEST(GSVector, YUVToRGB)

 #if 0
 // Performance test
-u32 g_gsvector_yuvtorgb_temp[64];
+alignas(VECTOR_ALIGNMENT) u32 g_gsvector_yuvtorgb_temp[64];

 TEST(GSVector, YUVToRGB_Scalar)
 {
--- a/src/common/align.h
+++ b/src/common/align.h
@ -92,8 +92,8 @@ ALWAYS_INLINE static void* AlignedMalloc(size_t size, size_t alignment)
 #else
  // Unaligned sizes are slow on macOS.
 #ifdef __APPLE__
-    if (IsPow2(alignment))
-      size = (size + alignment - 1) & ~(alignment - 1);
+  if (IsPow2(alignment))
+    size = (size + alignment - 1) & ~(alignment - 1);
 #endif
  void* ret = nullptr;
  return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@ -5,6 +5,7 @@
 #include "common/types.h"

 #include <algorithm>
+#include <cmath>

 #define GSVECTOR_HAS_UNSIGNED 1
 #define GSVECTOR_HAS_SRLV 1
@ -86,7 +87,7 @@ public:

  ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}

-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);

  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

@ -174,6 +175,8 @@ public:
    return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }

  ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
@ -190,6 +193,56 @@ public:

  ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(v2s); }

+#else
+
+  ALWAYS_INLINE u8 minv_u8() const
+  {
+    uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u8>(
+      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
+               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
+                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u8() const
+  {
+    uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u8>(
+      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
+               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
+                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
+  }
+
+  ALWAYS_INLINE u16 minv_u16() const
+  {
+    uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u16>(
+      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u16() const
+  {
+    uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u16>(
+      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
+  }
+
+  ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
+
+  ALWAYS_INLINE u32 minv_u32() const
+  {
+    return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
+  }
+
+  ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
+
+  ALWAYS_INLINE u32 maxv_u32() const
+  {
+    return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }

  ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
@ -249,6 +302,8 @@ public:
    return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
@ -272,6 +327,33 @@ public:

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }

+#else
+
+  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
+  }
+  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
+
+  ALWAYS_INLINE GSVector2i upl8() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
+
+#endif
+
  ALWAYS_INLINE GSVector2i i8to16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vget_low_s8(vmovl_s8(vreinterpret_s8_s32(v2s)))));
@ -465,7 +547,7 @@ public:

  ALWAYS_INLINE bool eq(const GSVector2i& v) const
  {
-    return (vmaxv_u32(vreinterpret_u32_s32(veor_s32(v2s, v.v2s))) == 0);
+    return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
  }

  ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
@ -483,11 +565,6 @@ public:
    return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
  }

-  ALWAYS_INLINE GSVector2i eq64(const GSVector2i& v) const
-  {
-    return GSVector2i(vreinterpret_s32_u64(vceq_s64(vreinterpret_s64_s32(v2s), vreinterpret_s64_s32(v.v2s))));
-  }
-
  ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }

  ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
@ -553,13 +630,23 @@ public:
  ALWAYS_INLINE bool alltrue() const
  {
    // MSB should be set in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
    return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
+#else
+    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
+            0x80808080u);
+#endif
  }

  ALWAYS_INLINE bool allfalse() const
  {
    // MSB should be clear in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
    return (vmaxv_u32(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
+#else
+    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
+            0);
+#endif
  }

  template<int i>
@ -744,10 +831,26 @@ public:
    return GSVector2(recip);
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }

  ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }

+#else
+
+  ALWAYS_INLINE GSVector2 floor() const
+  {
+    return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
+  }
+
+  ALWAYS_INLINE GSVector2 ceil() const
+  {
+    return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }

  ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
@ -791,7 +894,11 @@ public:
  template<int src, int dst>
  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
  {
+#ifdef CPU_ARCH_ARM64
    return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
+#else
+    return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
+#endif
  }

  template<int i>
@ -800,7 +907,15 @@ public:
    return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
  }

-  ALWAYS_INLINE float dot(const GSVector2& v) const { return vaddv_f32(vmul_f32(v2s, v.v2s)); }
+  ALWAYS_INLINE float dot(const GSVector2& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return vaddv_f32(vmul_f32(v2s, v.v2s));
+#else
+    const float32x2_t dp = vmul_f32(v2s, v.v2s);
+    return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
+#endif
+  }

  ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }

@ -817,7 +932,14 @@ public:
  ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
-  ALWAYS_INLINE void operator/=(const GSVector2& v) { v2s = vdiv_f32(v2s, v.v2s); }
+  ALWAYS_INLINE void operator/=(const GSVector2& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    v2s = vdiv_f32(v2s, v.v2s);
+#else
+    *this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
+#endif
+  }

  ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
@ -856,7 +978,12 @@ public:

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
  {
+#ifdef CPU_ARCH_ARM64
    return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
+#else
+    return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
+                     vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
+#endif
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
@ -1013,8 +1140,8 @@ public:
  ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {}
  ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}

-  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

@ -1035,7 +1162,14 @@ public:

  ALWAYS_INLINE s32 rarea() const { return width() * height(); }

-  ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); }
+  ALWAYS_INLINE bool rempty() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0);
+#else
+    return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) == 0);
+#endif
+  }

  ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); }

@ -1159,13 +1293,32 @@ public:

  ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
  {
-    int32x4_t acc =
+#ifdef CPU_ARCH_ARM64
+    const int32x4_t acc =
      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
-    acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
-    return GSVector4i(acc);
+    return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
+#else
+    // borrowed from sse2neon
+    const int32x4_t low =
+      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+    const int32x4_t high =
+      vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
+                                   vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
+#endif
  }

-  ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); }
+  ALWAYS_INLINE GSVector4i addp_s32() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4i(vpaddq_s32(v4s, v4s));
+#else
+    const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return GSVector4i(vcombine_s32(res, res));
+#endif
+  }
+
+#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }

@ -1183,6 +1336,70 @@ public:

  ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); }

+#else
+
+  ALWAYS_INLINE u8 minv_u8() const
+  {
+    uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
+    vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
+    return static_cast<u8>(
+      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
+               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
+                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u8() const
+  {
+    uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
+    vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
+    return static_cast<u8>(
+      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
+               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
+                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
+  }
+
+  ALWAYS_INLINE u16 minv_u16() const
+  {
+    uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
+    vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
+    return static_cast<u16>(
+      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u16() const
+  {
+    uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
+    vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
+    return static_cast<u16>(
+      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
+  }
+
+  ALWAYS_INLINE s32 minv_s32() const
+  {
+    int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
+  }
+
+  ALWAYS_INLINE u32 minv_u32() const
+  {
+    uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
+    return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(vmin), 0), vget_lane_u32(vreinterpret_u32_s32(vmin), 1));
+  }
+
+  ALWAYS_INLINE s32 maxv_s32() const
+  {
+    int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
+  }
+
+  ALWAYS_INLINE u32 maxv_u32() const
+  {
+    uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
+    return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(vmax), 0), vget_lane_u32(vreinterpret_u32_s32(vmax), 1));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }

  ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
@ -1224,7 +1441,13 @@ public:

  ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
  {
+#ifdef CPU_ARCH_ARM64
    return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
+#else
+    int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
+                                                       vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
+#endif
  }

  ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
@ -1271,6 +1494,8 @@ public:
    return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
@ -1341,6 +1566,81 @@ public:
    return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
  }

+#else
+
+  ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
+  {
+    const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
+  {
+    const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
+  {
+    const int16x4x2_t res =
+      vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
+  {
+    const int16x4x2_t res =
+      vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
+  {
+    const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
+    return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
+  {
+    const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
+    return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(
+      vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(
+      vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl64() const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+  }
+
+  ALWAYS_INLINE GSVector4i uph64() const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+  }
+#endif
+
  ALWAYS_INLINE GSVector4i i8to16() const
  {
    return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
@ -1537,11 +1837,14 @@ public:
    return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i))));
  }

+#ifdef CPU_ARCH_ARM64
+  // not on arm32, hopefully we can do without
  ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
  {
    return GSVector4i(
      vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
  }
+#endif

  template<int i>
  ALWAYS_INLINE GSVector4i srl64() const
@ -1554,11 +1857,13 @@ public:
    return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i))));
  }

+#ifdef CPU_ARCH_ARM64
  ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
  {
    return GSVector4i(
      vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
  }
+#endif

  ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
  {
@ -1588,7 +1893,14 @@ public:
    // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
    const int16x8_t a = vreinterpretq_s16_s32(v4s);
    const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
+#ifdef CPU_ARCH_ARM64
    return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    // sse2neon again
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
+#endif
  }

  ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
@ -1719,7 +2031,13 @@ public:

  ALWAYS_INLINE bool eq(const GSVector4i& v) const
  {
-    return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0);
+    const int32x4_t res = veorq_s32(v4s, v.v4s);
+#ifdef CPU_ARCH_ARM64
+    return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
+#else
+    const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
+    return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
+#endif
  }

  ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
@ -1737,10 +2055,12 @@ public:
    return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
  }

+#ifdef CPU_ARCH_ARM64
  ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
  }
+#endif

  ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }

@ -1807,13 +2127,23 @@ public:
  ALWAYS_INLINE bool alltrue() const
  {
    // MSB should be set in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
    return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80;
+#else
+    const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
+    return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u);
+#endif
  }

  ALWAYS_INLINE bool allfalse() const
  {
    // MSB should be clear in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
    return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
+#else
+    const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
+    return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0);
+#endif
  }

  template<int i>
@ -2099,10 +2429,7 @@ public:
    v4s = vld1q_f32(arr);
  }

-  ALWAYS_INLINE GSVector4(float x, float y)
-  {
-    v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
-  }
+  ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }

  ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
  {
@ -2112,7 +2439,7 @@ public:

  ALWAYS_INLINE GSVector4(int x, int y)
  {
-    v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
+    v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 0), 0));
  }

  ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
@ -2129,10 +2456,12 @@ public:

  ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);

+#ifdef CPU_ARCH_ARM64
  ALWAYS_INLINE static GSVector4 f64(double x, double y)
  {
    return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
  }
+#endif

  ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }

@ -2140,19 +2469,6 @@ public:

  ALWAYS_INLINE operator float32x4_t() const { return v4s; }

-  /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks
-  /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove
-  /// possibly-denormal garbage data from vectors before computing with them
-  ALWAYS_INLINE GSVector4 noopt()
-  {
-    // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the
-    // future the implementation should be updated
-#ifdef __clang__
-    // __asm__("":"+x"(m)::);
-#endif
-    return *this;
-  }
-
  ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }

  ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@ -2172,10 +2488,28 @@ public:
    return GSVector4(recip);
  }

+#ifdef _M_ARM64
+
  ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }

  ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }

+#else
+
+  ALWAYS_INLINE GSVector4 floor() const
+  {
+    return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
+                     std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
+  }
+
+  ALWAYS_INLINE GSVector4 ceil() const
+  {
+    return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
+                     std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
  {
    return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
@ -2197,6 +2531,8 @@ public:
    return a.nmadd(b, *this); // *this - a * b
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }

  ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
@ -2208,12 +2544,46 @@ public:
    return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
  }

+#else
+
+  ALWAYS_INLINE GSVector4 hadd() const
+  {
+    const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
+    return GSVector4(vcombine_f32(res, res));
+  }
+
+  ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
+  {
+    const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
+    const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
+    return GSVector4(vcombine_f32(res1, res2));
+  }
+
+  ALWAYS_INLINE GSVector4 hsub() const
+  {
+    const float32x4x2_t res = vuzpq_f32(v4s, v4s);
+    return GSVector4(vsubq_f32(res.val[0], res.val[0]));
+  }
+
+  ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
+  {
+    const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
+    return GSVector4(vsubq_f32(res.val[0], res.val[1]));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }

  ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
  {
+#ifdef CPU_ARCH_ARM64
    const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
    const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
+#else
+    const GSVector4 minv(a.xyxy());
+    const GSVector4 maxv(a.zwzw());
+#endif
    return sat(minv, maxv);
  }

@ -2239,6 +2609,8 @@ public:
    return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
  }

+#ifdef CPU_ARCH_ARM64
+
  ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }

  ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
@ -2253,6 +2625,34 @@ public:
    return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
  }

+#else
+
+  ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
+  {
+    const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
+    return GSVector4(vcombine_f32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
+  {
+    const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
+    return GSVector4(vcombine_f32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
+  {
+    return GSVector4(vreinterpretq_f32_s64(
+      vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
+  {
+    return GSVector4(vreinterpretq_f32_s64(
+      vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
+  }
+
+#endif
+
  ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
  {
    return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
@ -2270,8 +2670,15 @@ public:

  ALWAYS_INLINE int mask() const
  {
+#ifdef CPU_ARCH_ARM64
    static constexpr const int32_t shifts[] = {0, 1, 2, 3};
    return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
+#else
+    // sse2neon again
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
+    uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
  }

  ALWAYS_INLINE bool alltrue() const
@ -2290,7 +2697,11 @@ public:
  template<int src, int dst>
  ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
  {
+#ifdef CPU_ARCH_ARM64
    return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
+#else
+    return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
+#endif
  }

  template<int i>
@ -2320,12 +2731,20 @@ public:

  ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
  {
+#ifdef CPU_ARCH_ARM64
    vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
+#else
+    vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
+#endif
  }

  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
  {
+#ifdef CPU_ARCH_ARM64
    vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
+#else
+    vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
+#endif
  }

  template<bool aligned>
@ -2341,12 +2760,29 @@ public:
  ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
  ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
  ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
-  ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); }
+  ALWAYS_INLINE void operator/=(const GSVector4& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    v4s = vdivq_f32(v4s, v.v4s);
+#else
+    *this =
+      GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
+                vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
+#endif
+  }

  ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
-  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+  ALWAYS_INLINE void operator/=(float f)
+  {
+#ifdef CPU_ARCH_ARM64
+    *this /= GSVector4(f);
+#else
+    *this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
+                      vgetq_lane_f32(v4s, 3) / f);
+#endif
+  }

  ALWAYS_INLINE void operator&=(const GSVector4& v)
  {
@ -2380,13 +2816,27 @@ public:

  ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
  {
+#ifdef CPU_ARCH_ARM64
    return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
+#else
+    return GSVector4(
+      vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
+      vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
+#endif
  }

  ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
  ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
  ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
-  ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
+  ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
+  {
+#ifdef CPU_ARCH_ARM64
+    return v / GSVector4(f);
+#else
+    return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
+                     vgetq_lane_f32(v.v4s, 3) / f);
+#endif
+  }

  ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
  {
@ -2434,6 +2884,9 @@ public:
    return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
  }

+#ifdef CPU_ARCH_ARM64
+  // Not in ARM32
+
  ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
  {
    return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
@ -2459,14 +2912,15 @@ public:
    return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
  }

-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
  {
-    const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s));
-    const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
-    const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
+    const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
+    const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
    return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
  }

+#endif
+
  // clang-format off

 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@ -2498,21 +2952,39 @@ public:

  // clang-format on

-  ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); }
+  ALWAYS_INLINE GSVector4 broadcast32() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vdupq_laneq_f32(v4s, 0));
+#else
+    return xxxx();
+#endif
+  }

-  ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); }
+  ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vdupq_laneq_f32(v.v4s, 0));
+#else
+    return v.xxxx();
+#endif
+  }

  ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

  ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
  {
-    return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f)));
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
+#else
+    return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
+#endif
  }
 };

-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  v2s = truncate ? vcvt_s32_f32(v.v2s) : vcvtn_u32_f32(v.v2s);
+  v2s = vcvt_s32_f32(v.v2s);
 }

 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@ -2530,9 +3002,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
  return GSVector2(vreinterpret_f32_s32(v.v2s));
 }

-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
+  v4s = vcvtq_s32_f32(v.v4s);
 }

 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@ -121,7 +121,7 @@ public:
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }

-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);

  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

@ -955,7 +955,7 @@ public:
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }

-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

@ -1879,8 +1879,6 @@ public:

  ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }

-  ALWAYS_INLINE GSVector4 noopt() { return *this; }
-
  u32 rgba32() const { return GSVector4i(*this).rgba32(); }

  ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@ -2316,7 +2314,7 @@ public:
    return ret;
  }

-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
  {
    return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
  }
@ -2372,9 +2370,8 @@ public:
  }
 };

-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  // TODO: Truncation vs rounding...
  x = static_cast<s32>(v.x);
  y = static_cast<s32>(v.y);
 }
@ -2399,9 +2396,8 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
  return ret;
 }

-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  // TODO: Truncation vs rounding...
  x = static_cast<s32>(v.x);
  y = static_cast<s32>(v.y);
  z = static_cast<s32>(v.z);
--- a/src/common/gsvector_sse.h
+++ b/src/common/gsvector_sse.h
@ -88,7 +88,7 @@ public:
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }

-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);

  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

@ -840,9 +840,9 @@ public:
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }

-  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);

-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

@ -1952,9 +1952,9 @@ public:
    return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
  }

-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
  {
-    return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
+    return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m)));
  }

  // clang-format off
@ -2007,9 +2007,9 @@ public:
  }
 };

-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+  m = _mm_cvttps_epi32(v);
 }

 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@ -2027,9 +2027,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
  return GSVector2(_mm_castsi128_ps(v.m));
 }

-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+  m = _mm_cvttps_epi32(v);
 }

 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@ -66,8 +66,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)

 #if defined(CPU_ARCH_SSE)
  const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value));
-#elif defined(CPU_ARCH_NEON)
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
  const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value));
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
+  const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value));
 #endif

  // Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
@ -78,8 +80,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
  {
 #if defined(CPU_ARCH_SSE)
    _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue);
-#elif defined(CPU_ARCH_NEON)
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
    vst1q_u64(reinterpret_cast<u64*>(dest), svalue);
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
+    vst1q_u32(reinterpret_cast<u32*>(dest), svalue);
 #endif
    dest += PTRS_PER_VECTOR;
  }
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@ -14,6 +14,7 @@

 #include "util/page_fault_handler.h"

+#include "common/align.h"
 #include "common/assert.h"
 #include "common/error.h"
 #include "common/intrin.h"
@ -456,15 +457,15 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio
      s_blocks.erase(it);

      block->~Block();
-      std::free(block);
+      Common::AlignedFree(block);
      block = nullptr;
    }
  }

  if (!block)
  {
-    block =
-      static_cast<Block*>(std::malloc(sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size)));
+    block = static_cast<Block*>(Common::AlignedMalloc(
+      sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size), alignof(Block)));
    Assert(block);
    new (block) Block();
    s_blocks.push_back(block);
@ -734,7 +735,7 @@ void CPU::CodeCache::ClearBlocks()
  for (Block* block : s_blocks)
  {
    block->~Block();
-    std::free(block);
+    Common::AlignedFree(block);
  }
  s_blocks.clear();

--- a/src/core/cpu_recompiler_code_generator_aarch32.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp
@ -148,7 +148,7 @@ void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vix
 }

 void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
-                                      const void* addr, const vixl::aarch64::Register& tempreg)
+                                      const void* addr, const vixl::aarch32::Register& tempreg)
 {
  armMoveAddressToReg(armAsm, tempreg, addr);
  armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
@ -1931,12 +1931,12 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
  {
    if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    {
-      armEmitFarLoad(m_emit, RARG2, GetFetchMemoryAccessTimePtr());
-      m_emit->ldr(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
-      m_emit->Mov(RARG3, m_block->size);
-      m_emit->mul(RARG2, RARG2, RARG3);
-      m_emit->add(RARG1, RARG1, RARG2);
-      m_emit->str(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
+      armEmitFarLoad(m_emit, GetHostReg32(RARG2), GetFetchMemoryAccessTimePtr());
+      m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
+      m_emit->Mov(GetHostReg32(RARG3), m_block->size);
+      m_emit->mul(GetHostReg32(RARG2), GetHostReg32(RARG2), GetHostReg32(RARG3));
+      m_emit->add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(RARG2));
+      m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
    }
    else
    {