diff --git a/src/common/gsvector.h b/src/common/gsvector.h index fdef42676..0054699a0 100644 --- a/src/common/gsvector.h +++ b/src/common/gsvector.h @@ -1,6 +1,10 @@ // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) +// +// Lightweight wrapper over native SIMD types for cross-platform vector code. +// + #pragma once #include "common/intrin.h" diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 874371651..e4991af5e 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -284,8 +284,6 @@ public: vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s))))); } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s)))); @@ -537,16 +535,6 @@ public: return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); } - ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const - { - return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); - } - - ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const - { - return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); - } - ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); @@ -828,22 +816,11 @@ public: ALWAYS_INLINE operator float32x2_t() const { return v2s; } ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); } - ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); } - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); } - - ALWAYS_INLINE GSVector2 rcpnr() const - { - float32x2_t recip = vrecpe_f32(v2s); - recip = vmul_f32(recip, vrecps_f32(recip, v2s)); - return GSVector2(recip); - } - #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } - ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } #else @@ -1160,16 +1137,8 @@ public: // rect - ALWAYS_INLINE int width() const { return right - left; } - - ALWAYS_INLINE int height() const { return bottom - top; } - - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } + ALWAYS_INLINE s32 width() const { return right - left; } + ALWAYS_INLINE s32 height() const { return bottom - top; } ALWAYS_INLINE bool rempty() const { @@ -1456,8 +1425,6 @@ public: vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s))))); } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { #ifdef CPU_ARCH_ARM64 @@ -2004,50 +1971,6 @@ public: ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return 
a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { const int32x4_t res = veorq_s32(v4s, v.v4s); @@ -2355,36 +2278,39 @@ public: ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); } - // clang-format off +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \ + { \ + return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \ + } +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3); -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ - - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -2400,6 +2326,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -2442,6 +2370,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float 
z, float w) { const float arr[4] = {x, y, z, w}; @@ -2475,12 +2407,28 @@ public: ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); -#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE static GSVector4 f64(double x, double y) { +#ifdef CPU_ARCH_ARM64 return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); - } +#else + GSVector4 ret; + ret.F64[0] = x; + ret.F64[1] = y; + return ret; #endif + } + + ALWAYS_INLINE static GSVector4 f64(double x) + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x))); +#else + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; +#endif + } ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } @@ -2498,15 +2446,6 @@ public: ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); } - ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); } - - ALWAYS_INLINE GSVector4 rcpnr() const - { - float32x4_t recip = vrecpeq_f32(v4s); - recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); - return GSVector4(recip); - } - #ifdef _M_ARM64 ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } @@ -2529,27 +2468,6 @@ public: #endif - ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const - { - return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); - } - ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const - { - return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s)); - } - ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; } - ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; } - - ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const - { - return a.madd(b, *this); // *this + a * b - } - - ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const - { - return a.nmadd(b, *this); // *this - a * b - } - #ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } @@ -2729,6 +2647,28 @@ public: return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst))); +#else + GSVector4 ret; + ret.F64[dst] = v; + return ret; +#endif + } + + template + ALWAYS_INLINE double extract64() const + { +#ifdef CPU_ARCH_ARM64 + return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src); +#else + return F64[src]; +#endif + } + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } @@ -2903,73 +2843,219 @@ public: return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); } -#ifdef CPU_ARCH_ARM64 - // Not in ARM32 - ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const { - return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]); +#endif } ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const { - return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)))); +#else + return 
GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
+#endif
   }
 
   ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
   {
-    return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_u64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    GSVector4 ret;
+    ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
+    return ret;
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+#else
+    return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
+
+  ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
+
+  ALWAYS_INLINE GSVector4 sqrt64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 sqr64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
+#endif
+  }
+
+  ALWAYS_INLINE GSVector4 floor64() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
+#else
+    return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
+#endif
   }
 
   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
+#else
+    return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
+#endif
   }
 
   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
+#else
+    const float* fp = static_cast<const float*>(p);
+    return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
+#endif
   }
 
   ALWAYS_INLINE GSVector4i f64toi32() const
   {
+#ifdef CPU_ARCH_ARM64
     const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
     const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
+#else
+    const s32 low = static_cast<s32>(F64[0]);
+    const s32 high = static_cast<s32>(F64[1]);
+#endif
     return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
   }
 
-#endif
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
+  { \
+    return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
+  } \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
+  { \
+    return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
+  }
 
-  // clang-format off
+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
 
-#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
+#define
VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + VECTOR4_SHUFFLE_1(x, 0); + VECTOR4_SHUFFLE_1(y, 1); + VECTOR4_SHUFFLE_1(z, 2); + VECTOR4_SHUFFLE_1(w, 3); -#define VECTOR4_SHUFFLE_1(xs, xn) \ - VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ - - VECTOR4_SHUFFLE_1(x, 0) - VECTOR4_SHUFFLE_1(y, 1) - VECTOR4_SHUFFLE_1(z, 2) - VECTOR4_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4_SHUFFLE_1 +#undef VECTOR4_SHUFFLE_2 +#undef VECTOR4_SHUFFLE_3 +#undef VECTOR4_SHUFFLE_4 ALWAYS_INLINE GSVector4 broadcast32() const { diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index 8f2f26472..b460c4ca8 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -235,8 +235,6 @@ public: return ret; } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - GSVector2i shuffle8(const GSVector2i& mask) const { ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf])); @@ -601,15 +599,6 @@ public: GSVector2 neg() const { return GSVector2(-x, -y); } - GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); } - - GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); } GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); } @@ -973,16 +962,8 @@ public: // rect ALWAYS_INLINE s32 width() const { return right - left; } - ALWAYS_INLINE s32 height() const { return bottom - top; } - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } - ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } // TODO: Optimize for no-simd, this generates crap code. @@ -1185,8 +1166,6 @@ public: return ret; } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - GSVector4i shuffle8(const GSVector4i& mask) const { ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 
0 : (S8[mask.S8[i] & 0xf])); @@ -1447,64 +1426,14 @@ public: GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); } - GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); } - - GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } - GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); } - GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); } - GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); } GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); } GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if constexpr (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; } GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? 
-1 : 0); } @@ -1746,36 +1675,36 @@ public: ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); } ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); } - // clang-format off - // l/h/lh not implemented until needed +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]); } -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]);} +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3); - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -1791,6 +1720,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1832,6 +1763,10 @@ public: constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { this->x = x; @@ -1881,6 +1816,13 @@ public: return ret; } + ALWAYS_INLINE static GSVector4 f64(double x) + { + GSVector4 ret; + ret.F64[0] = ret.F64[1] = x; + return ret; + } + ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } u32 rgba32() const { return GSVector4i(*this).rgba32(); } @@ -1893,37 +1835,10 @@ public: GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); } - GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); } - - GSVector4 rcpnr() const - { - GSVector4 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), 
std::floor(w)); } GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); } - GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; } - - GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; } - - GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; } - - GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; } - - GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const - { - return a_.madd(b_, *this); // *this + a * b - } - - GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const - { - return a_.nmadd(b_, *this); // *this - a * b - } - GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); } GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); } @@ -2045,6 +1960,20 @@ public: return I32[i]; } + template + ALWAYS_INLINE GSVector4 insert64(double v) const + { + GSVector4 ret; + ret.F64[dst] = v; + return ret; + } + + template + ALWAYS_INLINE double extract64() const + { + return F64[src]; + } + ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); } ALWAYS_INLINE static constexpr GSVector4 xffffffff() @@ -2300,6 +2229,71 @@ public: return ret; } + ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const + { + return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]); + } + + ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + ret.U64[1] = (F64[1] <= v.F64[1]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const + { + return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const + { + return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1])); + } + + ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast(0x7FFFFFFFFFFFFFFFULL)); } + + ALWAYS_INLINE GSVector4 neg64() const {return *this ^ GSVector4::cxpr64(static_cast(0x8000000000000000ULL(); } + + ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); } + + ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); } + + ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); } + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { GSVector4 ret; @@ -2323,36 +2317,40 @@ public: return GSVector4i(static_cast(F64[0]), static_cast(F64[1]), 0, 0); } - // clang-format off +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \ + { \ + return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); \ + } -#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ - ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); } +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3); -#define VECTOR4_SHUFFLE_1(xs, xn) \ - VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4_SHUFFLE_1(x, 0); + VECTOR4_SHUFFLE_1(y, 1); + VECTOR4_SHUFFLE_1(z, 2); + VECTOR4_SHUFFLE_1(w, 3); - VECTOR4_SHUFFLE_1(x, 0) - VECTOR4_SHUFFLE_1(y, 1) - VECTOR4_SHUFFLE_1(z, 2) - VECTOR4_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4_SHUFFLE_1 +#undef VECTOR4_SHUFFLE_2 +#undef VECTOR4_SHUFFLE_3 +#undef VECTOR4_SHUFFLE_4 ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); } diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 
a2b31b4cc..b2ab9d9f9 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin // SPDX-License-Identifier: LGPL-3.0+ // +// Lightweight wrapper over native SIMD types for cross-platform vector code. // Rewritten and NEON+No-SIMD variants added for DuckStation. // @@ -63,11 +64,9 @@ public: GSVector2i() = default; ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); } - ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); } ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3) { return GSVector2i(cxpr_init, s0, s1, s2, s3); @@ -79,26 +78,26 @@ public: } ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); } - ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } - ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} { } - - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); - - ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); - ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector2i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { return m; } @@ -142,10 +141,7 @@ public: ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } - ALWAYS_INLINE s32 addv_s32() const - { - return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); - } + ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); } ALWAYS_INLINE u8 minv_u8() const { @@ -180,11 +176,8 @@ public: } ALWAYS_INLINE s32 minv_s32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 minv_u32() const { return std::min(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE s32 maxv_s32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } - ALWAYS_INLINE u32 maxv_u32() const { return std::max(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } @@ -203,7 +196,7 @@ public: template ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const { -#if defined(__AVX2__) +#if defined(CPU_ARCH_AVX2) return GSVector2i(_mm_blend_epi32(m, v.m, mask)); #else constexpr s32 bit1 = ((mask & 2) * 3) << 1; @@ -217,8 +210,6 @@ public: return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); } - ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } - ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); } 
ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); } @@ -333,39 +324,25 @@ public: #endif ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); } - ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); } ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); } ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); } @@ -399,7 +376,6 @@ public: ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); } ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); } - ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); } template @@ -442,24 +418,35 @@ public: } ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } - - ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64((__m128i*)p)); } - + ALWAYS_INLINE static GSVector2i load(const void* p) + { + return GSVector2i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector2i& v) { m = _mm_and_si128(m, v); } - ALWAYS_INLINE void operator|=(const GSVector2i& v) { m = _mm_or_si128(m, v); } - 
ALWAYS_INLINE void operator^=(const GSVector2i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + + ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) { @@ -485,6 +472,7 @@ public: ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); } @@ -500,7 +488,6 @@ class alignas(16) GSVector2 static constexpr cxpr_init_tag cxpr_init{}; constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} - constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} public: @@ -530,28 +517,20 @@ public: GSVector2() = default; constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } - constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } - constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); } - ALWAYS_INLINE GSVector2(int x, int y) { GSVector2i v_(x, y); - m = _mm_cvtepi32_ps(v_.m); } ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {} - ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {} - ALWAYS_INLINE explicit GSVector2(float f) { *this = f; } - ALWAYS_INLINE explicit GSVector2(int i) { #ifdef CPU_ARCH_AVX2 @@ -563,38 +542,22 @@ public: ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); - ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); - - ALWAYS_INLINE void operator=(float f) + ALWAYS_INLINE GSVector2& operator=(float f) { -#if CPU_ARCH_AVX2 - - m = _mm_broadcastss_ps(_mm_load_ss(&f)); - -#else - m = _mm_set1_ps(f); - -#endif + return *this; } - ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + ALWAYS_INLINE GSVector2& operator=(__m128 m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128() const { return m; } ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); } - ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); } - - ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); } - - ALWAYS_INLINE GSVector2 rcpnr() const - { - GSVector2 v_ = rcp(); - - return (v_ + v_) - (v_ * v_) * *this; - } - ALWAYS_INLINE GSVector2 floor() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); @@ -657,27 +620,77 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } - ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd((double*)p))); } + ALWAYS_INLINE static GSVector2 load(const void* p) + { + return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); + } ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd((double*)p, 
_mm_castps_pd(v.m)); } + ALWAYS_INLINE static void store(void* p, const GSVector2& v) + { + _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); + } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } - ALWAYS_INLINE void operator+=(const GSVector2& v_) { m = _mm_add_ps(m, v_); } - ALWAYS_INLINE void operator-=(const GSVector2& v_) { m = _mm_sub_ps(m, v_); } - ALWAYS_INLINE void operator*=(const GSVector2& v_) { m = _mm_mul_ps(m, v_); } - ALWAYS_INLINE void operator/=(const GSVector2& v_) { m = _mm_div_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_) + { + m = _mm_add_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_) + { + m = _mm_sub_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_) + { + m = _mm_mul_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_) + { + m = _mm_div_ps(m, v_); + return *this; + } - ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); } - ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); } - ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); } - ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); } + ALWAYS_INLINE GSVector2& operator+=(float f) + { + *this += GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator-=(float f) + { + *this -= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator*=(float f) + { + *this *= GSVector2(f); + return *this; + } + ALWAYS_INLINE GSVector2& operator/=(float f) + { + *this /= GSVector2(f); + return *this; + } - ALWAYS_INLINE void operator&=(const GSVector2& v_) { m = _mm_and_ps(m, v_); } - ALWAYS_INLINE void operator|=(const GSVector2& v_) { m = _mm_or_ps(m, v_); } - ALWAYS_INLINE void operator^=(const GSVector2& v_) { m = _mm_xor_ps(m, v_); } + ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_) + { + m = _mm_and_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_) + { + m = _mm_or_ps(m, v_); + return *this; + } + ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_) + { + m = _mm_xor_ps(m, v_); + return *this; + } ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { @@ -752,6 +765,8 @@ public: return GSVector2(_mm_cmple_ps(v1, v2)); } + ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); + ALWAYS_INLINE GSVector2 xy() const { return *this; } ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); } ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); } @@ -811,11 +826,9 @@ public: { return GSVector4i(cxpr_init, x, y, z, w); } - ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } - ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); @@ -828,9 +841,7 @@ public: } ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } - ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } - ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) { m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); @@ -844,36 +855,30 @@ public: ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = 
v.m; } - // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), - // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); - ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); - ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} - ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } - ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + ALWAYS_INLINE GSVector4i& operator=(s32 i) + { + m = _mm_set1_epi32(i); + return *this; + } + ALWAYS_INLINE GSVector4i& operator=(__m128i m_) + { + m = m_; + return *this; + } ALWAYS_INLINE operator __m128i() const { return m; } - // rect - ALWAYS_INLINE s32 width() const { return right - left; } - ALWAYS_INLINE s32 height() const { return bottom - top; } - ALWAYS_INLINE GSVector4i rsize() const - { - return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); - } - - ALWAYS_INLINE s32 rarea() const { return width() * height(); } - ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } @@ -882,8 +887,6 @@ public: ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } - // - ALWAYS_INLINE u32 rgba32() const { GSVector4i v = *this; @@ -1044,7 +1047,7 @@ public: template ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const { -#if defined(__AVX2__) +#if defined(CPU_ARCH_AVX2) return GSVector4i(_mm_blend_epi32(m, v.m, mask)); #else constexpr s32 bit3 = ((mask & 8) * 3) << 3; @@ -1060,8 +1063,6 @@ public: return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); } - ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } - ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); } @@ -1237,99 +1238,30 @@ public: #endif ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } - ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return 
GSVector4i(_mm_subs_epi8(m, v.m)); } - ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } - ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } - - ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } - - ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } - ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } - template - ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const - { - // (a - this) * f << shift + this - - return add16(a.sub16(*this).modulate16(f)); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) - { - // (a - b) * c << shift - - return a.sub16(b).modulate16(c); - } - - template - ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, - const GSVector4i& d) - { - // (a - b) * c << shift + d - - return d.add16(a.sub16(b).modulate16(c)); - } - - ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const - { - // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) - - return add16(a_.sub16(*this).mul16l(f).sra16<4>()); - } - - template - ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const - { - // a * f << shift - if (shift == 0) - { - return mul16hrs(f); - } - - return sll16().mul16hs(f); - } - ALWAYS_INLINE bool eq(const GSVector4i& v) const { - // pxor, ptest, je - - GSVector4i t = *this ^ v; - + const GSVector4i t = *this ^ v; return _mm_testz_si128(t, t) != 0; } @@ -1420,15 +1352,21 @@ public: return _mm_extract_epi64(m, i); } - ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { + return GSVector4i(_mm_stream_load_si128(static_cast(p))); + } ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } - ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); } + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + return GSVector4i(_mm_loadl_epi64(static_cast(p))); + } ALWAYS_INLINE static GSVector4i loadh(const void* p) { - return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); + return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast(p)))); } ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) @@ -1439,18 +1377,19 @@ public: template ALWAYS_INLINE static GSVector4i load(const void* p) { - return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p)); + return GSVector4i(aligned ? 
_mm_load_si128(static_cast(p)) : + _mm_loadu_si128(static_cast(p))); } ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } - ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } - ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); } - - ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); } + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); + } ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) { @@ -1462,20 +1401,30 @@ public: ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { if constexpr (aligned) - _mm_store_si128((__m128i*)p, v.m); + _mm_store_si128(static_cast<__m128i*>(p), v.m); else - _mm_storeu_si128((__m128i*)p, v.m); + _mm_storeu_si128(static_cast<__m128i*>(p), v.m); } ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } - ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } - ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } - ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); } - ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); } - ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); } + ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v) + { + m = _mm_and_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v) + { + m = _mm_or_si128(m, v); + return *this; + } + ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v) + { + m = _mm_xor_si128(m, v); + return *this; + } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) { @@ -1493,14 +1442,12 @@ public: } ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } - ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } @@ -1508,38 +1455,52 @@ public: ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); } - // clang-format off +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \ + { \ + return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \ + { \ + return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \ + { \ + return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, 
xn))); \ + } \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const \ + { \ + return GSVector4i( \ + _mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn))); \ + } -#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3); -#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ - VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3); -#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ - VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) -#define VECTOR4i_SHUFFLE_1(xs, xn) \ - VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ - VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ - VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ - VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + VECTOR4i_SHUFFLE_1(x, 0); + VECTOR4i_SHUFFLE_1(y, 1); + VECTOR4i_SHUFFLE_1(z, 2); + VECTOR4i_SHUFFLE_1(w, 3) - VECTOR4i_SHUFFLE_1(x, 0) - VECTOR4i_SHUFFLE_1(y, 1) - VECTOR4i_SHUFFLE_1(z, 2) - VECTOR4i_SHUFFLE_1(w, 3) - - // clang-format on +#undef VECTOR4i_SHUFFLE_1 +#undef VECTOR4i_SHUFFLE_2 +#undef VECTOR4i_SHUFFLE_3 +#undef VECTOR4i_SHUFFLE_4 }; class alignas(16) GSVector4 @@ -1555,6 +1516,8 @@ class alignas(16) GSVector4 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} + public: union { @@ -1586,35 +1549,29 @@ public: GSVector4() = default; constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } - constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } - constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } - constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } + constexpr static GSVector4 cxpr64(double x) { return 
+
   ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
-
   ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
-
   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   {
     GSVector4i v_(x, y, z, w);
-
     m = _mm_cvtepi32_ps(v_.m);
   }
-
   ALWAYS_INLINE GSVector4(int x, int y) { m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); }

   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {}
-
   ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
     : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
   {
@@ -1637,24 +1594,20 @@ public:
   ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);

-  ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
-
   ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
+  ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }

-  ALWAYS_INLINE void operator=(float f)
+  ALWAYS_INLINE GSVector4& operator=(float f)
   {
-#if CPU_ARCH_AVX2
-
-    m = _mm_broadcastss_ps(_mm_load_ss(&f));
-
-#else
-
     m = _mm_set1_ps(f);
-
-#endif
+    return *this;
   }

-  ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; }
+  ALWAYS_INLINE GSVector4& operator=(__m128 m_)
+  {
+    this->m = m_;
+    return *this;
+  }

   ALWAYS_INLINE operator __m128() const { return m; }
@@ -1668,15 +1621,6 @@ public:
   ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }

-  ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); }
-
-  ALWAYS_INLINE GSVector4 rcpnr() const
-  {
-    GSVector4 v_ = rcp();
-
-    return (v_ + v_) - (v_ * v_) * *this;
-  }
-
   ALWAYS_INLINE GSVector4 floor() const
   {
     return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
@@ -1684,52 +1628,6 @@ public:

   ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

-  ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fmadd_ps(m, a_, b_));
-#else
-    return *this * a_ + b_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fmsub_ps(m, a_, b_));
-#else
-    return *this * a_ - b_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fnmadd_ps(m, a_, b_));
-#else
-    return b_ - *this * a_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const
-  {
-#ifdef CPU_ARCH_AVX2
-    return GSVector4(_mm_fnmsub_ps(m, a_, b_));
-#else
-    return -b_ - *this * a_;
-#endif
-  }
-
-  ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
-  {
-    return a_.madd(b_, *this); // *this + a * b
-  }
-
-  ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
-  {
-    return a_.nmadd(b_, *this); // *this - a * b
-  }
-
   ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }

   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }
@@ -1824,52 +1722,132 @@ public:
     return _mm_extract_ps(m, i);
   }

+  template <int dst>
+  ALWAYS_INLINE GSVector4 insert64(double v) const
+  {
+    if constexpr (dst == 0)
+      return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_pd(&v)));
+    else
+      return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_pd(&v), 0));
+  }
+
+  template <int src>
+  ALWAYS_INLINE double extract64() const
+  {
+    double ret;
+    if constexpr (src == 0)
+      _mm_storel_pd(&ret, _mm_castps_pd(m));
+    else
+      _mm_storeh_pd(&ret, _mm_castps_pd(m));
+    return ret;
+  }
+
   ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
+  ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);

   ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }

-  ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); }
+  ALWAYS_INLINE static GSVector4 loadl(const void* p)
+  {
+    return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
+  }

   ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); }

   template <bool aligned>
   ALWAYS_INLINE static GSVector4 load(const void* p)
   {
-    return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));
+    return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
   }

-  ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); }
-
-  ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); }
-
-  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); }
+  ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
+  ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
+  {
+    _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
+  }
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
+  {
+    _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
+  }

   template <bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   {
     if constexpr (aligned)
-      _mm_store_ps((float*)p, v.m);
+      _mm_store_ps(static_cast<float*>(p), v.m);
     else
-      _mm_storeu_ps((float*)p, v.m);
+      _mm_storeu_ps(static_cast<float*>(p), v.m);
   }

   ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }

   ALWAYS_INLINE GSVector4 operator-() const { return neg(); }

-  ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); }
-  ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); }
-  ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); }
-  ALWAYS_INLINE void operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); }
+  ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
+  {
+    m = _mm_add_ps(m, v_);
+    return *this;
+  }

-  ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
-  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
-  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
-  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+  ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
+  {
+    m = _mm_sub_ps(m, v_);
+    return *this;
+  }

-  ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); }
-  ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); }
-  ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); }
+  ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
+  {
+    m = _mm_mul_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
+  {
+    m = _mm_div_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator+=(float f)
+  {
+    *this += GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator-=(float f)
+  {
+    *this -= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator*=(float f)
+  {
+    *this *= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator/=(float f)
+  {
+    *this /= GSVector4(f);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
+  {
+    m = _mm_and_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
+  {
+    m = _mm_or_ps(m, v_);
+    return *this;
+  }
+
+  ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
+  {
+    m = _mm_xor_ps(m, v_);
+    return *this;
+  }

   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
   {
@@ -1959,6 +1937,59 @@ public:
     return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   }

+  ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
+  {
+    return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
+  {
+    return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
+  }
+
+  ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
+
+  ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
+
+  ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }
+
+  ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }
+
+  ALWAYS_INLINE GSVector4 floor64() const
+  {
+    return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+  }
+
   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }

   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
@@ -1968,36 +1999,43 @@ public:

   ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); }

-  // clang-format off
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
+  { \
+    return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); \
+  } \
+  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const \
+  { \
+    return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); \
+  }

-#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
-  ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \

+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
+  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

-#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
-  VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \

+#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
+  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

-#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
-  VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \

+#define VECTOR4_SHUFFLE_1(xs, xn) \
+  VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
+  VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
+  VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
+  VECTOR4_SHUFFLE_2(xs, xn, w, 3);

-#define VECTOR4_SHUFFLE_1(xs, xn) \
-  VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
-  VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
-  VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
-  VECTOR4_SHUFFLE_2(xs, xn, w, 3) \

+  VECTOR4_SHUFFLE_1(x, 0);
+  VECTOR4_SHUFFLE_1(y, 1);
+  VECTOR4_SHUFFLE_1(z, 2);
+  VECTOR4_SHUFFLE_1(w, 3);

-  VECTOR4_SHUFFLE_1(x, 0)
-  VECTOR4_SHUFFLE_1(y, 1)
-  VECTOR4_SHUFFLE_1(z, 2)
-  VECTOR4_SHUFFLE_1(w, 3)
-
-  // clang-format on

+#undef VECTOR4_SHUFFLE_1
+#undef VECTOR4_SHUFFLE_2
+#undef VECTOR4_SHUFFLE_3
+#undef VECTOR4_SHUFFLE_4

 #if CPU_ARCH_AVX2
@@ -2007,7 +2045,7 @@ public:

   ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
   {
-    return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
+    return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
   }

 #endif
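
Not part of the patch above: a minimal usage sketch of the double-precision (f64) lane helpers this change adds to the SSE GSVector4 (insert64/extract64, max64, sqrt64, cxpr64 with double arguments). The helper name SqrtClamped and its parameters are illustrative only, and the sketch assumes the x86 build where GSVector4 wraps a __m128 and the *64 members reinterpret it as two 64-bit lanes via _mm_castps_pd.

    // Hypothetical example, not part of the patch: clamp two doubles to >= 0.0
    // and take their per-lane square roots using the new f64 helpers.
    #include "common/gsvector.h"

    static void SqrtClamped(double in0, double in1, double* out0, double* out1)
    {
      // insert64<N>() places a scalar double into lane N; zero() starts with 0.0 in both lanes.
      GSVector4 v = GSVector4::zero().insert64<0>(in0).insert64<1>(in1);

      // max64() against a broadcast 0.0 constant clamps negatives; sqrt64() is a per-lane sqrt.
      v = v.max64(GSVector4::cxpr64(0.0)).sqrt64();

      // extract64<N>() reads a lane back out as a scalar double.
      *out0 = v.extract64<0>();
      *out1 = v.extract64<1>();
    }

Reusing the existing 128-bit GSVector4 for two f64 lanes, rather than introducing a separate double vector class, keeps the integer, float, and double views interchangeable through plain bit casts, which is all the *64 members rely on. Likewise, once the VECTOR4_SHUFFLE_* macros above are expanded, a call such as v.yxwz() compiles down to a single _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 3, 0, 1)).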