From 0a4c037898f77fd260ba77100ffa9dc7edf24451 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 21 Mar 2024 17:48:42 +1000 Subject: [PATCH] GS: ARM64 compatibility --- pcsx2/CMakeLists.txt | 32 +- pcsx2/GS/GSState.cpp | 5 + pcsx2/GS/GSUtil.cpp | 3 + pcsx2/GS/GSVector.cpp | 4 + pcsx2/GS/GSVector.h | 23 + pcsx2/GS/GSVector4_arm64.h | 787 ++++++ pcsx2/GS/GSVector4i_arm64.h | 2222 +++++++++++++++ pcsx2/GS/MultiISA.cpp | 6 + pcsx2/GS/MultiISA.h | 2 + pcsx2/GS/Renderers/Common/GSVertex.h | 4 + pcsx2/GS/Renderers/DX11/D3D.cpp | 16 +- pcsx2/GS/Renderers/DX11/D3D.h | 2 - pcsx2/GS/Renderers/SW/GSDrawScanline.h | 9 +- .../SW/GSDrawScanlineCodeGenerator.arm64.cpp | 2443 +++++++++++++++++ .../SW/GSDrawScanlineCodeGenerator.arm64.h | 82 + pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 20 + .../SW/GSSetupPrimCodeGenerator.arm64.cpp | 339 +++ .../SW/GSSetupPrimCodeGenerator.arm64.h | 33 + pcsx2/pcsx2.vcxproj | 34 +- pcsx2/pcsx2.vcxproj.filters | 18 + 20 files changed, 6064 insertions(+), 20 deletions(-) create mode 100644 pcsx2/GS/GSVector4_arm64.h create mode 100644 pcsx2/GS/GSVector4i_arm64.h create mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp create mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h create mode 100644 pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp create mode 100644 pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index d5663e70ee..d350fdcfe7 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -449,12 +449,22 @@ set(pcsx2GSSourcesUnshared GS/Renderers/Common/GSVertexTraceFMM.cpp GS/Renderers/HW/GSRendererHWMultiISA.cpp GS/Renderers/SW/GSDrawScanline.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp GS/Renderers/SW/GSRasterizer.cpp GS/Renderers/SW/GSRendererSW.cpp - GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp ) +if(_M_X86) + list(APPEND pcsx2GSSourcesUnshared + GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp + GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp + ) +elseif(_M_ARM64) + list(APPEND pcsx2GSSourcesUnshared + GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp + GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp + ) +endif() + set(pcsx2GSSources GS/GS.cpp GS/GSCapture.cpp @@ -509,10 +519,6 @@ set(pcsx2GSHeaders GS/GSTables.h GS/GSUtil.h GS/GSVector.h - GS/GSVector4.h - GS/GSVector4i.h - GS/GSVector8.h - GS/GSVector8i.h GS/GSXXH.h GS/MultiISA.h GS/Renderers/Common/GSDevice.h @@ -540,6 +546,20 @@ set(pcsx2GSHeaders GS/Renderers/SW/GSVertexSW.h ) +if(_M_X86) + list(APPEND pcsx2GSHeaders + GS/GSVector4.h + GS/GSVector4i.h + GS/GSVector8.h + GS/GSVector8i.h + ) +elseif(_M_ARM64) + list(APPEND pcsx2GSHeaders + GS/GSVector4_arm64.h + GS/GSVector4i_arm64.h + ) +endif() + if(USE_OPENGL) list(APPEND pcsx2GSSources GS/Renderers/OpenGL/GLContext.cpp diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp index 443a028334..5154d7bc8b 100644 --- a/pcsx2/GS/GSState.cpp +++ b/pcsx2/GS/GSState.cpp @@ -3588,8 +3588,13 @@ __forceinline void GSState::VertexKick(u32 skip) break; } +#ifndef _M_ARM64 // We only care about the xy passing the skip test. zw is the offset coordinates for native culling. skip |= test.mask() & 0xff; +#else + // mask() is slow on ARM, so just pull the bits out instead, thankfully we only care about the first 4 bytes. 
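+ // Each byte of the compare result is all-zeros or all-ones, so checking the 0x80 bit of every byte in the low 64 bits (the xy words) is enough: e.g. an all-ones x lane sets the sign bit of bytes 0-3, making the expression below come out non-zero.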
+ skip |= (static_cast<u64>(test.extract64<0>()) & UINT64_C(0x8080808080808080)) != 0; +#endif } if (skip != 0) diff --git a/pcsx2/GS/GSUtil.cpp b/pcsx2/GS/GSUtil.cpp index a2a74be468..29d5898fac 100644 --- a/pcsx2/GS/GSUtil.cpp +++ b/pcsx2/GS/GSUtil.cpp @@ -207,6 +207,9 @@ GSRendererType GSUtil::GetPreferredRenderer() #if defined(__APPLE__) // Mac: Prefer Metal hardware. preferred_renderer = GSRendererType::Metal; +#elif defined(_WIN32) && defined(_M_ARM64) + // Default to DX12 on Windows-on-ARM. + preferred_renderer = GSRendererType::DX12; +#elif defined(_WIN32) // Use D3D device info to select renderer. preferred_renderer = D3D::GetPreferredRenderer(); diff --git a/pcsx2/GS/GSVector.cpp b/pcsx2/GS/GSVector.cpp index 40bf4f87d6..022d5746ad 100644 --- a/pcsx2/GS/GSVector.cpp +++ b/pcsx2/GS/GSVector.cpp @@ -59,6 +59,8 @@ constinit const GSVector4 GSVector4::m_xc1e00000000fffff = cxpr64(0xc1e00000000f constinit const GSVector4 GSVector4::m_max = cxpr(FLT_MAX); constinit const GSVector4 GSVector4::m_min = cxpr(FLT_MIN); +#ifdef _M_X86 + constinit const GSVector8 GSVector8::m_half = cxpr(0.5f); constinit const GSVector8 GSVector8::m_one = cxpr(1.0f); constinit const GSVector8 GSVector8::m_x7fffffff = cxpr(0x7fffffff); @@ -143,6 +145,8 @@ constinit const GSVector8i GSVector8i::m_x0f[33] = cxpr(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f), }; +#endif + GSVector4i GSVector4i::fit(int arx, int ary) const { GSVector4i r = *this; diff --git a/pcsx2/GS/GSVector.h b/pcsx2/GS/GSVector.h index 3d1530d4ac..2e0a3c3d53 100644 --- a/pcsx2/GS/GSVector.h +++ b/pcsx2/GS/GSVector.h @@ -84,6 +84,7 @@ typedef GSVector2T<int> GSVector2i; class GSVector4; class GSVector4i; +#if defined(_M_X86) #if _M_SSE >= 0x500 class GSVector8; @@ -102,16 +103,30 @@ class GSVector8i; #include "GSVector8i.h" #include "GSVector8.h" +#elif defined(_M_ARM64) +#include "GSVector4i_arm64.h" +#include "GSVector4_arm64.h" +#endif + // conversion __forceinline_odr GSVector4i::GSVector4i(const GSVector4& v, bool truncate) { +#if defined(_M_X86) m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); +#elif defined(_M_ARM64) + // GS thread uses default (nearest) rounding. v4s = truncate ?
vcvtq_s32_f32(v.v4s) : vcvtnq_s32_f32(v.v4s); +#endif } __forceinline_odr GSVector4::GSVector4(const GSVector4i& v) { +#if defined(_M_X86) m = _mm_cvtepi32_ps(v); +#elif defined(_M_ARM64) + v4s = vcvtq_f32_s32(v.v4s); +#endif } __forceinline_odr void GSVector4i::sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) @@ -153,12 +168,20 @@ __forceinline_odr void GSVector8i::sw32_inv(GSVector8i& a, GSVector8i& b) __forceinline_odr GSVector4i GSVector4i::cast(const GSVector4& v) { +#ifndef _M_ARM64 return GSVector4i(_mm_castps_si128(v.m)); +#else + return GSVector4i(vreinterpretq_s32_f32(v.v4s)); +#endif } __forceinline_odr GSVector4 GSVector4::cast(const GSVector4i& v) { +#ifndef _M_ARM64 return GSVector4(_mm_castsi128_ps(v.m)); +#else + return GSVector4(vreinterpretq_f32_s32(v.v4s)); +#endif } #if _M_SSE >= 0x500 diff --git a/pcsx2/GS/GSVector4_arm64.h b/pcsx2/GS/GSVector4_arm64.h new file mode 100644 index 0000000000..1b3765f38d --- /dev/null +++ b/pcsx2/GS/GSVector4_arm64.h @@ -0,0 +1,787 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) + : F32{x, y, z, w} + { + } + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) + : I32{x, y, z, w} + { + } + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) + : U64{x, y} + { + } + +public: + union + { + struct { float x, y, z, w; }; + struct { float r, g, b, a; }; + struct { float left, top, right, bottom; }; + float v[4]; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + float32x4_t v4s; + }; + + static const GSVector4 m_ps0123; + static const GSVector4 m_ps4567; + static const GSVector4 m_half; + static const GSVector4 m_one; + static const GSVector4 m_two; + static const GSVector4 m_four; + static const GSVector4 m_x4b000000; + static const GSVector4 m_x4f800000; + static const GSVector4 m_xc1e00000000fffff; + static const GSVector4 m_max; + static const GSVector4 m_min; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) + { + return GSVector4(cxpr_init, x, y, z, w); + } + + constexpr static GSVector4 cxpr(float x) + { + return GSVector4(cxpr_init, x, x, x, x); + } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) + { + return GSVector4(cxpr_init, x, y, z, w); + } + + constexpr static GSVector4 cxpr(int x) + { + return GSVector4(cxpr_init, x, x, x, x); + } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) + { + return GSVector4(cxpr_init, x, y); + } + + constexpr static GSVector4 cxpr64(u64 x) + { + return GSVector4(cxpr_init, x, x); + } + + __forceinline GSVector4(float x, float y, float z, float w) + { + const float arr[4] = { x, y, z, w }; + v4s = vld1q_f32(arr); + } + + __forceinline GSVector4(float x, float y) + { + v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0)); + } + + __forceinline GSVector4(int x, int y, int z, int w) + { + const int arr[4] = { x, y, z, w }; + v4s = vcvtq_f32_s32(vld1q_s32(arr)); + } + + __forceinline GSVector4(int x, int y) + { + v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0))); + } + + __forceinline explicit GSVector4(const GSVector2& v) + { + v4s = vcombine_f32(vld1_f32(v.v),
vcreate_f32(0)); + } + + __forceinline explicit GSVector4(const GSVector2i& v) + { + v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0))); + } + + __forceinline constexpr explicit GSVector4(float32x4_t m) + : v4s(m) + { + } + + __forceinline explicit GSVector4(float f) + { + v4s = vdupq_n_f32(f); + } + + __forceinline explicit GSVector4(int i) + { + v4s = vcvtq_f32_s32(vdupq_n_s32(i)); + } + + __forceinline explicit GSVector4(u32 u) + { + GSVector4i v((int)u); + + *this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>())); + } + + __forceinline explicit GSVector4(const GSVector4i& v); + + __forceinline static GSVector4 cast(const GSVector4i& v); + + __forceinline static GSVector4 f64(double x, double y) + { + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); + } + + __forceinline void operator=(float f) + { + v4s = vdupq_n_f32(f); + } + + __forceinline void operator=(float32x4_t m) + { + v4s = m; + } + + __forceinline operator float32x4_t() const + { + return v4s; + } + + /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks we don't need the whole vector + /// Useful for e.g. preventing clang from optimizing shuffles that remove possibly-denormal garbage data from vectors before computing with them + __forceinline GSVector4 noopt() + { + // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the future the implementation should be updated +#ifdef __clang__ + // __asm__("":"+x"(m)::); +#endif + return *this; + } + + __forceinline u32 rgba32() const + { + return GSVector4i(*this).rgba32(); + } + + __forceinline static GSVector4 rgba32(u32 rgba) + { + return GSVector4(GSVector4i::load((int)rgba).u8to32()); + } + + __forceinline static GSVector4 rgba32(u32 rgba, int shift) + { + return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift); + } + + __forceinline static GSVector4 unorm8(u32 rgba) + { + return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); + } + + __forceinline GSVector4 abs() const + { + return GSVector4(vabsq_f32(v4s)); + } + + __forceinline GSVector4 neg() const + { + return GSVector4(vnegq_f32(v4s)); + } + + __forceinline GSVector4 rcp() const + { + return GSVector4(vrecpeq_f32(v4s)); + } + + __forceinline GSVector4 rcpnr() const + { + float32x4_t recip = vrecpeq_f32(v4s); + recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); + return GSVector4(recip); + } + + template + __forceinline GSVector4 round() const + { + if constexpr (mode == Round_NegInf) + return floor(); + else if constexpr (mode == Round_PosInf) + return ceil(); + else if constexpr (mode == Round_NearestInt) + return GSVector4(vrndnq_f32(v4s)); + else + return GSVector4(vrndq_f32(v4s)); + } + + __forceinline GSVector4 floor() const + { + return GSVector4(vrndmq_f32(v4s)); + } + + __forceinline GSVector4 ceil() const + { + return GSVector4(vrndpq_f32(v4s)); + } + + // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html + +#define LOG_POLY0(x, c0) GSVector4(c0) +#define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0))) +#define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0))) +#define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0))) +#define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0))) +#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0))) + + __forceinline 
GSVector4 log2(int precision = 5) const + { + // NOTE: sign bit ignored, safe to pass negative numbers + + // The idea behind this algorithm is to split the float into two parts, log2(m * 2^e) => log2(m) + log2(2^e) => log2(m) + e, + // and then approximate the logarithm of the mantissa (it's 1.x when normalized, a nice short range). + + GSVector4 one = m_one; + + GSVector4i i = GSVector4i::cast(*this); + + GSVector4 e = GSVector4(((i << 1) >> 24) - GSVector4i::x0000007f()); + GSVector4 m = GSVector4::cast((i << 9) >> 9) | one; + + GSVector4 p; + + // Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ + + switch (precision) + { + case 3: + p = LOG_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + break; + case 4: + p = LOG_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + break; + default: + case 5: + p = LOG_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + break; + case 6: + p = LOG_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + break; + } + + // This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 + + p = p * (m - one); + + return p + e; + } + + __forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const + { + return *this * a + b; + } + + __forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const + { + return *this * a - b; + } + + __forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const + { + return b - *this * a; + } + + __forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const + { + return -b - *this * a; + } + + __forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const + { + return a.madd(b, *this); // *this + a * b + } + + __forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const + { + return a.nmadd(b, *this); // *this - a * b + } + + __forceinline GSVector4 hadd() const + { + return GSVector4(vpaddq_f32(v4s, v4s)); + } + + __forceinline GSVector4 hadd(const GSVector4& v) const + { + return GSVector4(vpaddq_f32(v4s, v.v4s)); + } + + __forceinline GSVector4 hsub() const + { + return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); + } + + __forceinline GSVector4 hsub(const GSVector4& v) const + { + return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s))); + } + + __forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const + { + return max(a).min(b); + } + + __forceinline GSVector4 sat(const GSVector4& a) const + { + const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0))); + const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1))); + return sat(minv, maxv); + } + + __forceinline GSVector4 sat(const float scale = 255) const + { + return sat(zero(), GSVector4(scale)); + } + + __forceinline GSVector4 clamp(const float scale = 255) const + { + return min(GSVector4(scale)); + } + + __forceinline GSVector4 min(const GSVector4& a) const + { + return GSVector4(vminq_f32(v4s, a.v4s)); + } + + __forceinline GSVector4 max(const GSVector4& a) const + { + return GSVector4(vmaxq_f32(v4s, a.v4s)); + } + + template + __forceinline GSVector4 blend32(const GSVector4& a) const + { + return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 
6 : 2, (mask & 8) ? 7 : 3)); + } + + __forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const + { + // duplicate sign bit across and bit select + const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31)); + return GSVector4(vbslq_f32(bitmask, a.v4s, v4s)); + } + + __forceinline GSVector4 upl(const GSVector4& a) const + { + return GSVector4(vzip1q_f32(v4s, a.v4s)); + } + + __forceinline GSVector4 uph(const GSVector4& a) const + { + return GSVector4(vzip2q_f32(v4s, a.v4s)); + } + + __forceinline GSVector4 upld(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + __forceinline GSVector4 uphd(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + __forceinline GSVector4 l2h(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s))); + } + + __forceinline GSVector4 h2l(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s))); + } + + __forceinline GSVector4 andnot(const GSVector4& v) const + { + return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s)))); + } + + __forceinline int mask() const + { + static const int32_t shifts[] = { 0, 1, 2, 3 }; + return static_cast(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts)))); + } + + __forceinline bool alltrue() const + { + // return mask() == 0xf; + return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + __forceinline bool allfalse() const + { + return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + __forceinline GSVector4 replace_nan(const GSVector4& v) const + { + return v.blend32(*this, *this == *this); + } + + template + __forceinline GSVector4 insert32(const GSVector4& v) const + { + return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src)); + } + + template + __forceinline int extract32() const + { + return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); + } + + __forceinline static GSVector4 zero() + { + return GSVector4(vdupq_n_f32(0.0f)); + } + + __forceinline static GSVector4 xffffffff() + { + return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); + } + + __forceinline static GSVector4 ps0123() + { + return GSVector4(m_ps0123); + } + + __forceinline static GSVector4 ps4567() + { + return GSVector4(m_ps4567); + } + + __forceinline static GSVector4 loadl(const void* p) + { + return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0))); + } + + __forceinline static GSVector4 load(float f) + { + return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); + } + + __forceinline static GSVector4 load(u32 u) + { + GSVector4i v = GSVector4i::load((int)u); + + return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>())); + } + + template + __forceinline static GSVector4 load(const void* p) + { + return GSVector4(vld1q_f32((const float*)p)); + } + + __forceinline static void storent(void* p, const GSVector4& v) + { + vst1q_f32((float*)p, v.v4s); + } + + __forceinline static void storel(void* p, const GSVector4& v) + { + vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); + } + + __forceinline static void storeh(void* p, const GSVector4& v) + { + 
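+ // Stores only the upper 64 bits (the z and w floats) of the vector to p, matching the storeh semantics of the x86 implementation.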
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); + } + + template + __forceinline static void store(void* p, const GSVector4& v) + { + vst1q_f32((float*)p, v.v4s); + } + + __forceinline static void store(float* p, const GSVector4& v) + { + vst1q_lane_f32(p, v.v4s, 0); + } + + __forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d) + { + GSVector4i mask = GSVector4i::x000000ff(); + + a = GSVector4(v & mask); + b = GSVector4((v >> 8) & mask); + c = GSVector4((v >> 16) & mask); + d = GSVector4((v >> 24)); + } + + __forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d) + { + GSVector4 v0 = a.xyxy(b); + GSVector4 v1 = c.xyxy(d); + + GSVector4 e = v0.xzxz(v1); + GSVector4 f = v0.ywyw(v1); + + GSVector4 v2 = a.zwzw(b); + GSVector4 v3 = c.zwzw(d); + + GSVector4 g = v2.xzxz(v3); + GSVector4 h = v2.ywyw(v3); + + a = e; + b = f; + c = g; + d = h; + } + + __forceinline GSVector4 operator-() const + { + return neg(); + } + + __forceinline void operator+=(const GSVector4& v) + { + v4s = vaddq_f32(v4s, v.v4s); + } + + __forceinline void operator-=(const GSVector4& v) + { + v4s = vsubq_f32(v4s, v.v4s); + } + + __forceinline void operator*=(const GSVector4& v) + { + v4s = vmulq_f32(v4s, v.v4s); + } + + __forceinline void operator/=(const GSVector4& v) + { + v4s = vdivq_f32(v4s, v.v4s); + } + + __forceinline void operator+=(float f) + { + *this += GSVector4(f); + } + + __forceinline void operator-=(float f) + { + *this -= GSVector4(f); + } + + __forceinline void operator*=(float f) + { + *this *= GSVector4(f); + } + + __forceinline void operator/=(float f) + { + *this /= GSVector4(f); + } + + __forceinline void operator&=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + __forceinline void operator|=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + __forceinline void operator^=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + __forceinline friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vaddq_f32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vsubq_f32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vmulq_f32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vdivq_f32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4 operator+(const GSVector4& v, float f) + { + return v + GSVector4(f); + } + + __forceinline friend GSVector4 operator-(const GSVector4& v, float f) + { + return v - GSVector4(f); + } + + __forceinline friend GSVector4 operator*(const GSVector4& v, float f) + { + return v * GSVector4(f); + } + + __forceinline friend GSVector4 operator/(const GSVector4& v, float f) + { + return v / GSVector4(f); + } + + __forceinline friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + __forceinline friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + return 
GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + __forceinline friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + __forceinline friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + // NEON has no != + return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s)))); + } + + __forceinline friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); + } + + __forceinline GSVector4 mul64(const GSVector4& v) const + { + return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + __forceinline GSVector4 add64(const GSVector4& v) const + { + return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + __forceinline GSVector4 sub64(const GSVector4& v) const + { + return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + __forceinline static GSVector4 f32to64(const GSVector4& v) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s)))); + } + + __forceinline static GSVector4 f32to64(const void* p) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast(p))))); + } + + __forceinline GSVector4i f64toi32(bool truncate = true) const + { + const float64x2_t r = truncate ? 
vreinterpretq_f64_f32(v4s) : vrndiq_f64(vreinterpretq_f64_f32(v4s)); + const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0)); + const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1)); + return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1)); + } + + // clang-format off + + #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + __forceinline GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \ + __forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); } + + #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + + #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + + #define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + + __forceinline GSVector4 broadcast32() const + { + return GSVector4(vdupq_laneq_f32(v4s, 0)); + } + + __forceinline static GSVector4 broadcast32(const GSVector4& v) + { + return GSVector4(vdupq_laneq_f32(v.v4s, 0)); + } + + __forceinline static GSVector4 broadcast32(const void* f) + { + return GSVector4(vld1q_dup_f32((const float*)f)); + } + + __forceinline static GSVector4 broadcast64(const void* f) + { + return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f))); + } +}; diff --git a/pcsx2/GS/GSVector4i_arm64.h b/pcsx2/GS/GSVector4i_arm64.h new file mode 100644 index 0000000000..598a3dd659 --- /dev/null +++ b/pcsx2/GS/GSVector4i_arm64.h @@ -0,0 +1,2222 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#pragma once + +#include "common/Assertions.h" + +class alignas(16) GSVector4i +{ + static const GSVector4i m_xff[17]; + static const GSVector4i m_x0f[17]; + + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, int x, int y, int z, int w) + : I32{x, y, z, w} + { + } + + constexpr GSVector4i(cxpr_init_tag, short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + +public: + union + { + struct + { + int x, y, z, w; + }; + struct + { + int r, g, b, a; + }; + struct + { + int left, top, right, bottom; + }; + int v[4]; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + int32x4_t v4s; + }; + + GSVector4i() = default; + + constexpr static GSVector4i cxpr(int x, int y, int z, int w) + { + return GSVector4i(cxpr_init, x, y, z,
w); + } + + constexpr static GSVector4i cxpr(int x) + { + return GSVector4i(cxpr_init, x, x, x, x); + } + + constexpr static GSVector4i cxpr16(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + constexpr static GSVector4i cxpr8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + __forceinline GSVector4i(int x, int y, int z, int w) + { + GSVector4i xz = load(x).upl32(load(z)); + GSVector4i yw = load(y).upl32(load(w)); + + *this = xz.upl32(yw); + } + + __forceinline GSVector4i(int x, int y) + { + *this = load(x).upl32(load(y)); + } + + __forceinline GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + + __forceinline explicit GSVector4i(const GSVector2i& v) + { + v4s = vcombine_s32(vld1_s32(v.v), vcreate_s32(0)); + } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), so leave the non-constexpr version default + __forceinline explicit GSVector4i(int i) + { + *this = i; + } + + __forceinline constexpr explicit GSVector4i(int32x4_t m) + : v4s(m) + { + } + + __forceinline explicit GSVector4i(const GSVector4& v, bool truncate = true); + + __forceinline static GSVector4i cast(const GSVector4& v); + + __forceinline void operator=(int i) + { + v4s = vdupq_n_s32(i); + } + + __forceinline operator int32x4_t() const + { + return v4s; + } + + // rect + + __forceinline int width() const + { + return right - left; + } + + __forceinline int height() const + { + return bottom - top; + } + + __forceinline GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + __forceinline unsigned int rarea() const + { + return width() * height(); + } + + __forceinline bool rempty() const + { + return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); + } + + __forceinline GSVector4i runion(const GSVector4i& a) const + { + return min_i32(a).upl64(max_i32(a).srl<8>()); + } + + __forceinline GSVector4i rintersect(const GSVector4i& a) const + { + return sat_i32(a); + } + + __forceinline bool rintersects(const GSVector4i& v) const + { + return !rintersect(v).rempty(); + } + + __forceinline bool rcontains(const GSVector4i& v) const + { + return rintersect(v).eq(v); + } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = *this + mask; + break; + case Align_Outside: + v = *this + mask.zwxy(); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = *this + mask.xyxy(); + break; + default: + pxAssert(0); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i 
ralign_presub(const GSVector2i& a) const + { + return _ralign_helper(GSVector4i(a)); + } + + template + GSVector4i ralign(const GSVector2i& a) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(a) - GSVector4i(1, 1)); + } + + GSVector4i fit(int arx, int ary) const; + + GSVector4i fit(int preset) const; + + // + + __forceinline u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + __forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const + { + return max_i8(a).min_i8(b); + } + + __forceinline GSVector4i sat_i8(const GSVector4i& a) const + { + return max_i8(a.xyxy()).min_i8(a.zwzw()); + } + + __forceinline GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const + { + return max_i16(a).min_i16(b); + } + + __forceinline GSVector4i sat_i16(const GSVector4i& a) const + { + return max_i16(a.xyxy()).min_i16(a.zwzw()); + } + + __forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const + { + return max_i32(a).min_i32(b); + } + + __forceinline GSVector4i sat_i32(const GSVector4i& a) const + { + return max_i32(a.xyxy()).min_i32(a.zwzw()); + } + + __forceinline GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const + { + return max_u8(a).min_u8(b); + } + + __forceinline GSVector4i sat_u8(const GSVector4i& a) const + { + return max_u8(a.xyxy()).min_u8(a.zwzw()); + } + + __forceinline GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const + { + return max_u16(a).min_u16(b); + } + + __forceinline GSVector4i sat_u16(const GSVector4i& a) const + { + return max_u16(a.xyxy()).min_u16(a.zwzw()); + } + + __forceinline GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const + { + return max_u32(a).min_u32(b); + } + + __forceinline GSVector4i sat_u32(const GSVector4i& a) const + { + return max_u32(a.xyxy()).min_u32(a.zwzw()); + } + + __forceinline GSVector4i min_i8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(a.v4s)))); + } + + __forceinline GSVector4i max_i8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(a.v4s)))); + } + + __forceinline GSVector4i min_i16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s)))); + } + + __forceinline GSVector4i max_i16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s)))); + } + + __forceinline GSVector4i min_i32(const GSVector4i& a) const + { + return GSVector4i(vminq_s32(v4s, a.v4s)); + } + + __forceinline GSVector4i max_i32(const GSVector4i& a) const + { + return GSVector4i(vmaxq_s32(v4s, a.v4s)); + } + + __forceinline GSVector4i min_u8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(a.v4s)))); + } + + __forceinline GSVector4i max_u8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(a.v4s)))); + } + + __forceinline GSVector4i min_u16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(a.v4s)))); + } + + __forceinline GSVector4i max_u16(const GSVector4i& a) const + { + return 
GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(a.v4s)))); + } + + __forceinline GSVector4i min_u32(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(a.v4s)))); + } + + __forceinline GSVector4i max_u32(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(a.v4s)))); + } + + __forceinline s32 minv_s32() const + { + return vminvq_s32(v4s); + } + + __forceinline u32 minv_u32() const + { + return vminvq_u32(v4s); + } + + __forceinline u32 maxv_s32() const + { + return vmaxvq_s32(v4s); + } + + __forceinline u32 maxv_u32() const + { + return vmaxvq_u32(v4s); + } + + __forceinline static int min_i16(int a, int b) + { + return store(load(a).min_i16(load(b))); + } + + __forceinline GSVector4i clamp8() const + { + return pu16().upl8(); + } + + __forceinline GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const + { + uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7)); + return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s)))); + } + + template + __forceinline GSVector4i blend16(const GSVector4i& a) const + { + const uint16_t _mask[8] = {((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 5)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0}; + return GSVector4i(vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s)))); + } + + template + __forceinline GSVector4i blend32(const GSVector4i& v) const + { + constexpr int bit3 = ((mask & 8) * 3) << 3; + constexpr int bit2 = ((mask & 4) * 3) << 2; + constexpr int bit1 = ((mask & 2) * 3) << 1; + constexpr int bit0 = (mask & 1) * 3; + return blend16(v); + } + + /// Equivalent to blend with the given mask broadcasted across the vector + /// May be faster than blend in some cases + template + __forceinline GSVector4i smartblend(const GSVector4i& a) const + { + if (mask == 0) + return *this; + if (mask == 0xffffffff) + return a; + + if (mask == 0x0000ffff) + return blend16<0x55>(a); + if (mask == 0xffff0000) + return blend16<0xaa>(a); + + for (int i = 0; i < 32; i += 8) + { + u8 byte = (mask >> i) & 0xff; + if (byte != 0xff && byte != 0) + return blend(a, GSVector4i(mask)); + } + + return blend8(a, GSVector4i(mask)); + } + + __forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const + { + return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)), vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(a.v4s))))); + } + + __forceinline GSVector4i mix16(const GSVector4i& a) const + { + return blend16<0xaa>(a); + } + + __forceinline GSVector4i shuffle8(const GSVector4i& mask) const + { + return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s)))); + } + + __forceinline GSVector4i ps16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(a.v4s))))); + } + + __forceinline GSVector4i ps16() const + { + return 
GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s))))); + } + + __forceinline GSVector4i pu16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(a.v4s))))); + } + + __forceinline GSVector4i pu16() const + { + return GSVector4i(vreinterpretq_s32_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s))))); + } + + __forceinline GSVector4i ps32(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(a.v4s)))); + } + + __forceinline GSVector4i ps32() const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s)))); + } + + __forceinline GSVector4i pu32(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(a.v4s)))); + } + + __forceinline GSVector4i pu32() const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s)))); + } + + __forceinline GSVector4i upl8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(a.v4s)))); + } + + __forceinline GSVector4i uph8(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(a.v4s)))); + } + + __forceinline GSVector4i upl16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s)))); + } + + __forceinline GSVector4i uph16(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s)))); + } + + __forceinline GSVector4i upl32(const GSVector4i& a) const + { + return GSVector4i(vzip1q_s32(v4s, a.v4s)); + } + + __forceinline GSVector4i uph32(const GSVector4i& a) const + { + return GSVector4i(vzip2q_s32(v4s, a.v4s)); + } + + __forceinline GSVector4i upl64(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(a.v4s))))); + } + + __forceinline GSVector4i uph64(const GSVector4i& a) const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(a.v4s))))); + } + + __forceinline GSVector4i upl8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + __forceinline GSVector4i uph8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + __forceinline GSVector4i upl16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + __forceinline GSVector4i uph16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + __forceinline GSVector4i upl32() const + { + return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); + } + + __forceinline GSVector4i uph32() const + { + return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); + } + + __forceinline GSVector4i upl64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + __forceinline GSVector4i uph64() const + { + return 
GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + __forceinline GSVector4i i8to16() const + { + return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); + } + + __forceinline GSVector4i u8to16() const + { + return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))); + } + + __forceinline GSVector4i i8to32() const + { + return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))); + } + + __forceinline GSVector4i u8to32() const + { + return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))); + } + + __forceinline GSVector4i i8to64() const + { + return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))))))); + } + + __forceinline GSVector4i u8to64() const + { + return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))))); + } + + __forceinline GSVector4i i16to32() const + { + return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); + } + + __forceinline GSVector4i u16to32() const + { + return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))); + } + + __forceinline GSVector4i i16to64() const + { + return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s))))))); + } + + __forceinline GSVector4i u16to64() const + { + return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))))); + } + + __forceinline GSVector4i i32to64() const + { + return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); + } + + __forceinline GSVector4i u32to64() const + { + return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s))))); + } + + template <int i> + __forceinline GSVector4i srl() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i))); + } + + template <int i> + __forceinline GSVector4i srl(const GSVector4i& v) + { + if constexpr (i >= 16) + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16))); + else + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i))); + } + + template <int i> + __forceinline GSVector4i sll() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i))); + } + + template <int i> + __forceinline GSVector4i sll16() const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i))); + } + + __forceinline GSVector4i sll16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i)))); + } + + __forceinline GSVector4i sllv16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + template <int i> + __forceinline GSVector4i srl16() const + { + return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i))); + } + + __forceinline GSVector4i srl16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i)))); + } + + __forceinline GSVector4i srlv16(const GSVector4i& v) const + { + return GSVector4i(
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template <int i> + __forceinline GSVector4i sra16() const + { + constexpr int count = (i & ~15) ? 15 : i; + return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count))); + } + + __forceinline GSVector4i sra16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i)))); + } + + __forceinline GSVector4i srav16(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template <int i> + __forceinline GSVector4i sll32() const + { + return GSVector4i(vshlq_n_s32(v4s, i)); + } + + __forceinline GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); } + + __forceinline GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); } + + template <int i> + __forceinline GSVector4i srl32() const + { + return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i))); + } + + __forceinline GSVector4i srl32(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i)))); + } + + __forceinline GSVector4i srlv32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s)))); + } + + template <int i> + __forceinline GSVector4i sra32() const + { + return GSVector4i(vshrq_n_s32(v4s, i)); + } + + __forceinline GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); } + + __forceinline GSVector4i srav32(const GSVector4i& v) const + { + return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s))); + } + + template <int i> + __forceinline GSVector4i sll64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + __forceinline GSVector4i sll64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i)))); + } + + __forceinline GSVector4i sllv64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + template <int i> + __forceinline GSVector4i sra64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + __forceinline GSVector4i sra64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(-i)))); + } + + __forceinline GSVector4i srav64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + template <int i> + __forceinline GSVector4i srl64() const + { + return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i))); + } + + __forceinline GSVector4i srl64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i)))); + } + + __forceinline GSVector4i srlv64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + __forceinline GSVector4i add8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i add16(const GSVector4i& v) const + {
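+ // The s32<->s16 vreinterpret round-trips are purely for the type system and generate no instructions; only the vaddq_s16 is emitted. The same pattern applies to the other sized operations in this class.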
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i add32(const GSVector4i& v) const + { + return GSVector4i(vaddq_s32(v4s, v.v4s)); + } + + __forceinline GSVector4i adds8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i adds16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i hadds16(const GSVector4i& v) const + { + // can't use vpaddq_s16() here, because we need saturation. + //return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + const int16x8_t a = vreinterpretq_s16_s32(v4s); + const int16x8_t b = vreinterpretq_s16_s32(v.v4s); + return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); + } + + __forceinline GSVector4i addus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + __forceinline GSVector4i addus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + __forceinline GSVector4i sub8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i sub16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i sub32(const GSVector4i& v) const + { + return GSVector4i(vsubq_s32(v4s, v.v4s)); + } + + __forceinline GSVector4i subs8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i subs16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i subus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + __forceinline GSVector4i subus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + __forceinline GSVector4i avg8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + __forceinline GSVector4i avg16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + __forceinline GSVector4i mul16hs(const GSVector4i& v) const + { + // from sse2neon + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return 
GSVector4i(vreinterpretq_s32_u16(r.val[1])); + } + + __forceinline GSVector4i mul16l(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i mul16hrs(const GSVector4i& v) const + { + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi))); + } + + template + __forceinline GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + __forceinline GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a.sub16(*this).mul16l(f).sra16<4>()); + } + + template + __forceinline GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + __forceinline bool eq(const GSVector4i& v) const + { + return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0); + } + + __forceinline GSVector4i eq8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i eq16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i eq32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s))); + } + + __forceinline GSVector4i eq64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + __forceinline GSVector4i neq8(const GSVector4i& v) const + { + return ~eq8(v); + } + + __forceinline GSVector4i neq16(const GSVector4i& v) const + { + return ~eq16(v); + } + + __forceinline GSVector4i neq32(const GSVector4i& v) const + { + return ~eq32(v); + } + + __forceinline GSVector4i gt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i gt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i gt32(const GSVector4i& v) const + { + return GSVector4i(vcgtq_s32(v4s, v.v4s)); + } + + __forceinline GSVector4i ge8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i ge16(const GSVector4i& v) const + { + return 
GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i ge32(const GSVector4i& v) const + { + return GSVector4i(vcgeq_s32(v4s, v.v4s)); + } + + __forceinline GSVector4i lt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i lt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i lt32(const GSVector4i& v) const + { + return GSVector4i(vcltq_s32(v4s, v.v4s)); + } + + __forceinline GSVector4i le8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + __forceinline GSVector4i le16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + __forceinline GSVector4i le32(const GSVector4i& v) const + { + return GSVector4i(vcleq_s32(v4s, v.v4s)); + } + + + __forceinline GSVector4i andnot(const GSVector4i& v) const + { + return GSVector4i(vbicq_s32(v4s, v.v4s)); + } + + __forceinline int mask() const + { + // borrowed from sse2neon + const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7)); + const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return static_cast(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); + } + + __forceinline bool alltrue() const + { + // MSB should be set in all 8-bit lanes. + return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; + } + + __forceinline bool allfalse() const + { + // MSB should be clear in all 8-bit lanes. 
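
// NEON has no _mm_movemask_epi8, so mask() (from sse2neon) shifts each byte's
// sign bit down to bit 0, then folds neighbouring lanes together with
// shift-right-and-accumulate (vsraq) at 16-, 32- and 64-bit widths until bytes
// 0 and 8 hold the two packed halves; alltrue()/allfalse() avoid the packing
// entirely with horizontal min/max. Scalar reference (hypothetical helper name):

#include <cstdint>

static inline int movemask8_ref(const uint8_t bytes[16])
{
	// One bit per byte lane, like _mm_movemask_epi8.
	int mask = 0;
	for (int i = 0; i < 16; i++)
		mask |= (bytes[i] >> 7) << i;
	return mask;
}
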
+		return (vmaxvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
+	}
+
+	template <int i>
+	__forceinline GSVector4i insert8(int a) const
+	{
+		return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<int>(i))));
+	}
+
+	template <int i>
+	__forceinline int extract8() const
+	{
+		return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
+	}
+
+	template <int i>
+	__forceinline GSVector4i insert16(int a) const
+	{
+		return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<int>(i))));
+	}
+
+	template <int i>
+	__forceinline int extract16() const
+	{
+		return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
+	}
+
+	template <int i>
+	__forceinline GSVector4i insert32(int a) const
+	{
+		return GSVector4i(vsetq_lane_s32(a, v4s, i));
+	}
+
+	template <int i>
+	__forceinline int extract32() const
+	{
+		return vgetq_lane_s32(v4s, i);
+	}
+
+	template <int i>
+	__forceinline GSVector4i insert64(s64 a) const
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
+	}
+
+	template <int i>
+	__forceinline s64 extract64() const
+	{
+		return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
+	}
+
+	template <int src, class T>
+	__forceinline GSVector4i gather8_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<src + 0>() & 0xf]);
+		v = v.insert8<1>((int)ptr[extract8<src + 0>() >> 4]);
+		v = v.insert8<2>((int)ptr[extract8<src + 1>() & 0xf]);
+		v = v.insert8<3>((int)ptr[extract8<src + 1>() >> 4]);
+		v = v.insert8<4>((int)ptr[extract8<src + 2>() & 0xf]);
+		v = v.insert8<5>((int)ptr[extract8<src + 2>() >> 4]);
+		v = v.insert8<6>((int)ptr[extract8<src + 3>() & 0xf]);
+		v = v.insert8<7>((int)ptr[extract8<src + 3>() >> 4]);
+		v = v.insert8<8>((int)ptr[extract8<src + 4>() & 0xf]);
+		v = v.insert8<9>((int)ptr[extract8<src + 4>() >> 4]);
+		v = v.insert8<10>((int)ptr[extract8<src + 5>() & 0xf]);
+		v = v.insert8<11>((int)ptr[extract8<src + 5>() >> 4]);
+		v = v.insert8<12>((int)ptr[extract8<src + 6>() & 0xf]);
+		v = v.insert8<13>((int)ptr[extract8<src + 6>() >> 4]);
+		v = v.insert8<14>((int)ptr[extract8<src + 7>() & 0xf]);
+		v = v.insert8<15>((int)ptr[extract8<src + 7>() >> 4]);
+
+		return v;
+	}
+
+	template <class T>
+	__forceinline GSVector4i gather8_8(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<0>()]);
+		v = v.insert8<1>((int)ptr[extract8<1>()]);
+		v = v.insert8<2>((int)ptr[extract8<2>()]);
+		v = v.insert8<3>((int)ptr[extract8<3>()]);
+		v = v.insert8<4>((int)ptr[extract8<4>()]);
+		v = v.insert8<5>((int)ptr[extract8<5>()]);
+		v = v.insert8<6>((int)ptr[extract8<6>()]);
+		v = v.insert8<7>((int)ptr[extract8<7>()]);
+		v = v.insert8<8>((int)ptr[extract8<8>()]);
+		v = v.insert8<9>((int)ptr[extract8<9>()]);
+		v = v.insert8<10>((int)ptr[extract8<10>()]);
+		v = v.insert8<11>((int)ptr[extract8<11>()]);
+		v = v.insert8<12>((int)ptr[extract8<12>()]);
+		v = v.insert8<13>((int)ptr[extract8<13>()]);
+		v = v.insert8<14>((int)ptr[extract8<14>()]);
+		v = v.insert8<15>((int)ptr[extract8<15>()]);
+
+		return v;
+	}
+
+	template <int dst, class T>
+	__forceinline GSVector4i gather8_16(const T* ptr, const GSVector4i& a) const
+	{
+		GSVector4i v = a;
+
+		v = v.insert8<dst + 0>((int)ptr[extract16<0>()]);
+		v = v.insert8<dst + 1>((int)ptr[extract16<1>()]);
+		v = v.insert8<dst + 2>((int)ptr[extract16<2>()]);
+		v = v.insert8<dst + 3>((int)ptr[extract16<3>()]);
+		v = v.insert8<dst + 4>((int)ptr[extract16<4>()]);
+		v = v.insert8<dst + 5>((int)ptr[extract16<5>()]);
+		v = v.insert8<dst + 6>((int)ptr[extract16<6>()]);
+		v = v.insert8<dst + 7>((int)ptr[extract16<7>()]);
+
+		return v;
+	}
+
+	template <int dst, class T>
+	__forceinline GSVector4i gather8_32(const T* ptr, const GSVector4i& a) const
+	{
+		GSVector4i v = a;
+
+		v = v.insert8<dst + 0>((int)ptr[extract32<0>()]);
+		v = v.insert8<dst + 1>((int)ptr[extract32<1>()]);
+		v = 
v.insert8((int)ptr[extract32<2>()]); + v = v.insert8((int)ptr[extract32<3>()]); + + return v; + } + + template + __forceinline GSVector4i gather16_4(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract8() & 0xf]); + v = v.insert16<1>((int)ptr[extract8() >> 4]); + v = v.insert16<2>((int)ptr[extract8() & 0xf]); + v = v.insert16<3>((int)ptr[extract8() >> 4]); + v = v.insert16<4>((int)ptr[extract8() & 0xf]); + v = v.insert16<5>((int)ptr[extract8() >> 4]); + v = v.insert16<6>((int)ptr[extract8() & 0xf]); + v = v.insert16<7>((int)ptr[extract8() >> 4]); + + return v; + } + + template + __forceinline GSVector4i gather16_8(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract8()]); + v = v.insert16<1>((int)ptr[extract8()]); + v = v.insert16<2>((int)ptr[extract8()]); + v = v.insert16<3>((int)ptr[extract8()]); + v = v.insert16<4>((int)ptr[extract8()]); + v = v.insert16<5>((int)ptr[extract8()]); + v = v.insert16<6>((int)ptr[extract8()]); + v = v.insert16<7>((int)ptr[extract8()]); + + return v; + } + + template + __forceinline GSVector4i gather16_16(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract16<0>()]); + v = v.insert16<1>((int)ptr[extract16<1>()]); + v = v.insert16<2>((int)ptr[extract16<2>()]); + v = v.insert16<3>((int)ptr[extract16<3>()]); + v = v.insert16<4>((int)ptr[extract16<4>()]); + v = v.insert16<5>((int)ptr[extract16<5>()]); + v = v.insert16<6>((int)ptr[extract16<6>()]); + v = v.insert16<7>((int)ptr[extract16<7>()]); + + return v; + } + + template + __forceinline GSVector4i gather16_16(const T1* ptr1, const T2* ptr2) const + { + GSVector4i v; + + v = load((int)ptr2[ptr1[extract16<0>()]]); + v = v.insert16<1>((int)ptr2[ptr1[extract16<1>()]]); + v = v.insert16<2>((int)ptr2[ptr1[extract16<2>()]]); + v = v.insert16<3>((int)ptr2[ptr1[extract16<3>()]]); + v = v.insert16<4>((int)ptr2[ptr1[extract16<4>()]]); + v = v.insert16<5>((int)ptr2[ptr1[extract16<5>()]]); + v = v.insert16<6>((int)ptr2[ptr1[extract16<6>()]]); + v = v.insert16<7>((int)ptr2[ptr1[extract16<7>()]]); + + return v; + } + + template + __forceinline GSVector4i gather16_32(const T* ptr, const GSVector4i& a) const + { + GSVector4i v = a; + + v = v.insert16((int)ptr[extract32<0>()]); + v = v.insert16((int)ptr[extract32<1>()]); + v = v.insert16((int)ptr[extract32<2>()]); + v = v.insert16((int)ptr[extract32<3>()]); + + return v; + } + + template + __forceinline GSVector4i gather32_4(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract8() & 0xf]); + v = v.insert32<1>((int)ptr[extract8() >> 4]); + v = v.insert32<2>((int)ptr[extract8() & 0xf]); + v = v.insert32<3>((int)ptr[extract8() >> 4]); + return v; + } + + template + __forceinline GSVector4i gather32_8(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract8()]); + v = v.insert32<1>((int)ptr[extract8()]); + v = v.insert32<2>((int)ptr[extract8()]); + v = v.insert32<3>((int)ptr[extract8()]); + + return v; + } + + template + __forceinline GSVector4i gather32_16(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract16()]); + v = v.insert32<1>((int)ptr[extract16()]); + v = v.insert32<2>((int)ptr[extract16()]); + v = v.insert32<3>((int)ptr[extract16()]); + + return v; + } + + template + __forceinline GSVector4i gather32_32(const T* ptr) const + { + GSVector4i v; + + v = load((int)ptr[extract32<0>()]); + v = v.insert32<1>((int)ptr[extract32<1>()]); + v = v.insert32<2>((int)ptr[extract32<2>()]); + v = v.insert32<3>((int)ptr[extract32<3>()]); + + return v; + } + + template + __forceinline 
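
// AArch64 NEON has no gather loads, so every gather*_* here is emulated with
// scalar table reads stitched back into a vector lane by lane. The *_4
// variants consume two packed 4-bit indices per byte (low nibble first),
// matching the GS's 4-bit palette formats, and the (ptr1, ptr2) overloads
// chain two tables for index-then-CLUT lookups. A scalar sketch of that
// two-level form (gather32_32_ref is a hypothetical name):

#include <cstdint>

static inline void gather32_32_ref(const uint32_t idx[4], const uint8_t* tbl,
	const uint32_t* clut, uint32_t out[4])
{
	// Per lane: out = clut[tbl[idx]], as in gather32_32(ptr1, ptr2).
	for (int i = 0; i < 4; i++)
		out[i] = clut[tbl[idx[i]]];
}
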
GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const + { + GSVector4i v; + + v = load((int)ptr2[ptr1[extract32<0>()]]); + v = v.insert32<1>((int)ptr2[ptr1[extract32<1>()]]); + v = v.insert32<2>((int)ptr2[ptr1[extract32<2>()]]); + v = v.insert32<3>((int)ptr2[ptr1[extract32<3>()]]); + + return v; + } + + template + __forceinline GSVector4i gather64_4(const T* ptr) const + { + GSVector4i v; + + v = loadq((s64)ptr[extract8() & 0xf]); + v = v.insert64<1>((s64)ptr[extract8() >> 4]); + + return v; + } + + template + __forceinline GSVector4i gather64_8(const T* ptr) const + { + GSVector4i v; + + v = loadq((s64)ptr[extract8()]); + v = v.insert64<1>((s64)ptr[extract8()]); + + return v; + } + + template + __forceinline GSVector4i gather64_16(const T* ptr) const + { + GSVector4i v; + + v = loadq((s64)ptr[extract16()]); + v = v.insert64<1>((s64)ptr[extract16()]); + + return v; + } + + template + __forceinline GSVector4i gather64_32(const T* ptr) const + { + GSVector4i v; + + v = loadq((s64)ptr[extract32()]); + v = v.insert64<1>((s64)ptr[extract32()]); + + return v; + } + + template + __forceinline GSVector4i gather64_64(const T* ptr) const + { + GSVector4i v; + + v = loadq((s64)ptr[extract64<0>()]); + v = v.insert64<1>((s64)ptr[extract64<1>()]); + + return v; + } + + template + __forceinline void gather8_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather8_4<0>(ptr); + dst[1] = gather8_4<8>(ptr); + } + + __forceinline void gather8_8(const u8* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather8_8<>(ptr); + } + + template + __forceinline void gather16_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather16_4<0>(ptr); + dst[1] = gather16_4<4>(ptr); + dst[2] = gather16_4<8>(ptr); + dst[3] = gather16_4<12>(ptr); + } + + template + __forceinline void gather16_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather16_8<0>(ptr); + dst[1] = gather16_8<8>(ptr); + } + + template + __forceinline void gather16_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather16_16<>(ptr); + } + + template + __forceinline void gather32_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather32_4<0>(ptr); + dst[1] = gather32_4<2>(ptr); + dst[2] = gather32_4<4>(ptr); + dst[3] = gather32_4<6>(ptr); + dst[4] = gather32_4<8>(ptr); + dst[5] = gather32_4<10>(ptr); + dst[6] = gather32_4<12>(ptr); + dst[7] = gather32_4<14>(ptr); + } + + template + __forceinline void gather32_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather32_8<0>(ptr); + dst[1] = gather32_8<4>(ptr); + dst[2] = gather32_8<8>(ptr); + dst[3] = gather32_8<12>(ptr); + } + + template + __forceinline void gather32_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather32_16<0>(ptr); + dst[1] = gather32_16<4>(ptr); + } + + template + __forceinline void gather32_32(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather32_32<>(ptr); + } + + template + __forceinline void gather64_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const + { + dst[0] = gather64_4<0>(ptr); + dst[1] = gather64_4<1>(ptr); + dst[2] = gather64_4<2>(ptr); + dst[3] = gather64_4<3>(ptr); + dst[4] = gather64_4<4>(ptr); + dst[5] = gather64_4<5>(ptr); + dst[6] = gather64_4<6>(ptr); + dst[7] = gather64_4<7>(ptr); + dst[8] = gather64_4<8>(ptr); + dst[9] = gather64_4<9>(ptr); + dst[10] = gather64_4<10>(ptr); + dst[11] = gather64_4<11>(ptr); + dst[12] = gather64_4<12>(ptr); + dst[13] = 
gather64_4<13>(ptr);
+		dst[14] = gather64_4<14>(ptr);
+		dst[15] = gather64_4<15>(ptr);
+	}
+
+	template <class T>
+	__forceinline void gather64_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_8<0>(ptr);
+		dst[1] = gather64_8<2>(ptr);
+		dst[2] = gather64_8<4>(ptr);
+		dst[3] = gather64_8<6>(ptr);
+		dst[4] = gather64_8<8>(ptr);
+		dst[5] = gather64_8<10>(ptr);
+		dst[6] = gather64_8<12>(ptr);
+		dst[7] = gather64_8<14>(ptr);
+	}
+
+	template <class T>
+	__forceinline void gather64_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_16<0>(ptr);
+		dst[1] = gather64_16<2>(ptr);
+		dst[2] = gather64_16<4>(ptr);
+		dst[3] = gather64_16<6>(ptr);
+	}
+
+	template <class T>
+	__forceinline void gather64_32(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_32<0>(ptr);
+		dst[1] = gather64_32<2>(ptr);
+	}
+
+	template <class T>
+	__forceinline void gather64_64(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_64<>(ptr);
+	}
+
+	__forceinline static GSVector4i loadnt(const void* p)
+	{
+#if __has_builtin(__builtin_nontemporal_load)
+		return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
+#else
+		return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
+#endif
+	}
+
+	__forceinline static GSVector4i loadl(const void* p)
+	{
+		return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
+	}
+
+	__forceinline static GSVector4i loadh(const void* p)
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
+	}
+
+	__forceinline static GSVector4i loadh(const void* p, const GSVector4i& v)
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p))));
+	}
+
+	__forceinline static GSVector4i loadh(const GSVector2i& v)
+	{
+		return loadh(&v);
+	}
+
+	__forceinline static GSVector4i load(const void* pl, const void* ph)
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph))));
+	}
+
+	template <bool aligned>
+	__forceinline static GSVector4i load(const void* p)
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
+	}
+
+	__forceinline static GSVector4i load(int i)
+	{
+		return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0));
+	}
+
+	__forceinline static GSVector4i loadq(s64 i)
+	{
+		return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0)));
+	}
+
+	__forceinline static void storent(void* p, const GSVector4i& v)
+	{
+#if __has_builtin(__builtin_nontemporal_store)
+		__builtin_nontemporal_store(v.v4s, ((int32x4_t*)p));
+#else
+		vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
+#endif
+	}
+
+	__forceinline static void storel(void* p, const GSVector4i& v)
+	{
+		vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
+	}
+
+	__forceinline static void storeh(void* p, const GSVector4i& v)
+	{
+		vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
+	}
+
+	__forceinline static void store(void* pl, void* ph, const GSVector4i& v)
+	{
+		GSVector4i::storel(pl, v);
+		GSVector4i::storeh(ph, v);
+	}
+
+	template <bool aligned>
+	__forceinline static void store(void* p, const GSVector4i& v)
+	{
+		vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
+	}
+
+	__forceinline static int store(const GSVector4i& v)
+	{
+		return vgetq_lane_s32(v.v4s, 0);
+	}
+
+	__forceinline static s64 storeq(const GSVector4i& v)
+	{
+		return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0);
+	}
+
+	__forceinline static void storent(void* RESTRICT 
dst, const void* RESTRICT src, size_t size) + { + const GSVector4i* s = (const GSVector4i*)src; + GSVector4i* d = (GSVector4i*)dst; + + if (size == 0) + return; + + size_t i = 0; + size_t j = size >> 6; + + for (; i < j; i++, s += 4, d += 4) + { + storent(&d[0], s[0]); + storent(&d[1], s[1]); + storent(&d[2], s[2]); + storent(&d[3], s[3]); + } + + size &= 63; + + if (size == 0) + return; + + memcpy(d, s, size); + } + + __forceinline static void mix4(GSVector4i& a, GSVector4i& b) + { + GSVector4i mask(vdupq_n_s32(0x0f0f0f0f)); + + GSVector4i c = (b << 4).blend(a, mask); + GSVector4i d = b.blend(a >> 4, mask); + a = c; + b = d; + } + + __forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + mix4(a, b); + mix4(c, d); + sw8(a, b, c, d); + } + + __forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = e.upl8(b); + c = e.uph8(b); + b = f.upl8(d); + d = f.uph8(d); + } + + __forceinline static void sw16(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = e.upl16(b); + c = e.uph16(b); + b = f.upl16(d); + d = f.uph16(d); + } + + __forceinline static void sw16rl(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = b.upl16(e); + c = e.uph16(b); + b = d.upl16(f); + d = f.uph16(d); + } + + __forceinline static void sw16rh(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = e.upl16(b); + c = b.uph16(e); + b = f.upl16(d); + d = d.uph16(f); + } + + __forceinline static void sw32(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = e.upl32(b); + c = e.uph32(b); + b = f.upl32(d); + d = f.uph32(d); + } + + __forceinline static void sw32_inv(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d); + + __forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d) + { + GSVector4i e = a; + GSVector4i f = c; + + a = e.upl64(b); + c = e.uph64(b); + b = f.upl64(d); + d = f.uph64(d); + } + + __forceinline static bool compare16(const void* dst, const void* src, size_t size) + { + pxAssert((size & 15) == 0); + + size >>= 4; + + GSVector4i* s = (GSVector4i*)src; + GSVector4i* d = (GSVector4i*)dst; + + for (size_t i = 0; i < size; i++) + { + if (!d[i].eq(s[i])) + { + return false; + } + } + + return true; + } + + __forceinline static bool compare64(const void* dst, const void* src, size_t size) + { + pxAssert((size & 63) == 0); + + size >>= 6; + + GSVector4i* s = (GSVector4i*)src; + GSVector4i* d = (GSVector4i*)dst; + + for (size_t i = 0; i < size; ++i) + { + GSVector4i v0 = (d[i * 4 + 0] == s[i * 4 + 0]); + GSVector4i v1 = (d[i * 4 + 1] == s[i * 4 + 1]); + GSVector4i v2 = (d[i * 4 + 2] == s[i * 4 + 2]); + GSVector4i v3 = (d[i * 4 + 3] == s[i * 4 + 3]); + + v0 = v0 & v1; + v2 = v2 & v3; + + if (!(v0 & v2).alltrue()) + { + return false; + } + } + + return true; + } + + __forceinline static bool update(const void* dst, const void* src, size_t size) + { + pxAssert((size & 15) == 0); + + size >>= 4; + + GSVector4i* s = (GSVector4i*)src; + GSVector4i* d = (GSVector4i*)dst; + + GSVector4i v = GSVector4i::xffffffff(); + + for (size_t i = 0; i < size; i++) + { + v &= d[i] == s[i]; + + d[i] = s[i]; + } + + return v.alltrue(); + } + + __forceinline void operator+=(const GSVector4i& v) + { + v4s = vaddq_s32(v4s, v.v4s); + } + + __forceinline 
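
// The sw8/sw16/sw32/sw64 helpers are butterfly stages over the upl*/uph*
// interleaves (NEON zip1/zip2, mirroring SSE punpckl/punpckh); two chained
// stages transpose a 4x4 block, which the GS swizzling code uses to move
// between linear and block-arranged memory. After sw32 then sw64, a, c, b, d
// hold columns 0-3 of the original matrix (note b and c end up swapped).
// Scalar reference for the net effect:

#include <cstdint>
#include <utility>

static inline void transpose4x4_ref(uint32_t m[4][4])
{
	for (int r = 0; r < 4; r++)
		for (int c = r + 1; c < 4; c++)
			std::swap(m[r][c], m[c][r]);
}
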
void operator-=(const GSVector4i& v) + { + v4s = vsubq_s32(v4s, v.v4s); + } + + __forceinline void operator+=(int i) + { + *this += GSVector4i(i); + } + + __forceinline void operator-=(int i) + { + *this -= GSVector4i(i); + } + + __forceinline void operator<<=(const int i) + { + v4s = vshlq_s32(v4s, vdupq_n_s32(i)); + } + + __forceinline void operator>>=(const int i) + { + v4s = vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))); + } + + __forceinline void operator&=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + __forceinline void operator|=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + __forceinline void operator^=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + __forceinline friend GSVector4i operator+(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vaddq_s32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4i operator-(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vsubq_s32(v1.v4s, v2.v4s)); + } + + __forceinline friend GSVector4i operator+(const GSVector4i& v, int i) + { + return v + GSVector4i(i); + } + + __forceinline friend GSVector4i operator-(const GSVector4i& v, int i) + { + return v - GSVector4i(i); + } + + __forceinline friend GSVector4i operator<<(const GSVector4i& v, const int i) + { + return GSVector4i(vshlq_s32(v.v4s, vdupq_n_s32(i))); + } + + __forceinline friend GSVector4i operator>>(const GSVector4i& v, const int i) + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v.v4s), vdupq_n_s32(-i)))); + } + + __forceinline friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + __forceinline friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + __forceinline friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + __forceinline friend GSVector4i operator&(const GSVector4i& v, int i) + { + return v & GSVector4i(i); + } + + __forceinline friend GSVector4i operator|(const GSVector4i& v, int i) + { + return v | GSVector4i(i); + } + + __forceinline friend GSVector4i operator^(const GSVector4i& v, int i) + { + return v ^ GSVector4i(i); + } + + __forceinline friend GSVector4i operator~(const GSVector4i& v) + { + return GSVector4i(vmvnq_s32(v.v4s)); + } + + __forceinline friend GSVector4i operator==(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4i operator!=(const GSVector4i& v1, const GSVector4i& v2) + { + return ~(v1 == v2); + } + + __forceinline friend GSVector4i operator>(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_u32(vcgtq_s32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4i operator<(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_u32(vcltq_s32(v1.v4s, v2.v4s))); + } + + __forceinline friend GSVector4i operator>=(const 
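
// Note the shift operators: NEON only shifts by a runtime amount to the left
// (SSHL/USHL), so a variable right shift is encoded as a left shift by a
// negated count; the unsigned form gives the logical shift that operator>>
// implements above. The idiom in isolation (srl32_var is a hypothetical name):

#include <arm_neon.h>

static inline uint32x4_t srl32_var(uint32x4_t v, int count)
{
	// USHL by -count performs a logical right shift by count.
	return vshlq_u32(v, vdupq_n_s32(-count));
}
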
GSVector4i& v1, const GSVector4i& v2) + { + return (v1 > v2) | (v1 == v2); + } + + __forceinline friend GSVector4i operator<=(const GSVector4i& v1, const GSVector4i& v2) + { + return (v1 < v2) | (v1 == v2); + } + + // clang-format off + + + #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + __forceinline GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } + + // __forceinline GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // __forceinline GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // __forceinline GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // __forceinline GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} + + #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + + #define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + + #define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // TODO: Make generic like above. 
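
// The VECTOR4i_SHUFFLE_* macro tower generates all 256 xyzw() permutations;
// each expands to a single __builtin_shufflevector, which Clang lowers to a
// fitting NEON permute (EXT, REV64, TRN, or TBL in the general case),
// replacing the _mm_shuffle_epi32 forms kept above as comments. One expansion
// written out:

// VECTOR4i_SHUFFLE_4(z, 2, w, 3, x, 0, y, 1) produces:
__forceinline GSVector4i zwxy() const
{
	return GSVector4i(__builtin_shufflevector(v4s, v4s, 2, 3, 0, 1));
}
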
+ __forceinline GSVector4i xxzzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 2, 2, 4, 4, 6, 6))); } + __forceinline GSVector4i yywwlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 1, 3, 3, 5, 5, 7, 7))); } + __forceinline GSVector4i yxwzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 5, 4, 7, 6))); } + __forceinline GSVector4i xxxxlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 4, 4, 4))); } + + __forceinline GSVector4i xxxxl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 5, 6, 7))); } + __forceinline GSVector4i zwxyl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 0, 1, 4, 5, 6, 7))); } + __forceinline GSVector4i yxwzl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 4, 5, 6, 7))); } + __forceinline GSVector4i zwzwl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 2, 3, 4, 5, 6, 7))); } + + __forceinline GSVector4i zzzzh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 6, 6, 6, 6))); } + + // clang-format on + + /// Noop, here so broadcast128 can be used generically over all vectors + __forceinline static GSVector4i broadcast128(const GSVector4i& v) + { + return v; + } + + __forceinline static GSVector4i broadcast16(u16 value) + { + return GSVector4i(vreinterpretq_s32_u16(vdupq_n_u16((value)))); + } + + __forceinline static GSVector4i zero() { return GSVector4i(0); } + + __forceinline static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); } + + __forceinline static GSVector4i x00000001() { return xffffffff().srl32<31>(); } + __forceinline static GSVector4i x00000003() { return xffffffff().srl32<30>(); } + __forceinline static GSVector4i x00000007() { return xffffffff().srl32<29>(); } + __forceinline static GSVector4i x0000000f() { return xffffffff().srl32<28>(); } + __forceinline static GSVector4i x0000001f() { return xffffffff().srl32<27>(); } + __forceinline static GSVector4i x0000003f() { return xffffffff().srl32<26>(); } + __forceinline static GSVector4i x0000007f() { return xffffffff().srl32<25>(); } + __forceinline static GSVector4i x000000ff() { return xffffffff().srl32<24>(); } + __forceinline static GSVector4i x000001ff() { return xffffffff().srl32<23>(); } + __forceinline static GSVector4i x000003ff() { return xffffffff().srl32<22>(); } + __forceinline static GSVector4i x000007ff() { return xffffffff().srl32<21>(); } + __forceinline static GSVector4i x00000fff() { return xffffffff().srl32<20>(); } + __forceinline static GSVector4i x00001fff() { return xffffffff().srl32<19>(); } + __forceinline static GSVector4i x00003fff() { return xffffffff().srl32<18>(); } + __forceinline static GSVector4i x00007fff() { return xffffffff().srl32<17>(); } + __forceinline static GSVector4i x0000ffff() { return xffffffff().srl32<16>(); } + __forceinline static 
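
// Every mask constant below is materialized from an all-ones register with a
// single immediate shift (two instructions, no literal-pool load), the NEON
// counterpart of the classic pcmpeqd+psrld trick. For example, x000000ff() is
// all-ones shifted right by 24 in each 32-bit lane; built directly for
// comparison (x000000ff_ref is a hypothetical name):

#include <arm_neon.h>

static inline uint32x4_t x000000ff_ref()
{
	// USHR #24 on all-ones yields 0x000000ff per lane.
	return vshrq_n_u32(vdupq_n_u32(~0u), 24);
}
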
GSVector4i x0001ffff() { return xffffffff().srl32<15>(); } + __forceinline static GSVector4i x0003ffff() { return xffffffff().srl32<14>(); } + __forceinline static GSVector4i x0007ffff() { return xffffffff().srl32<13>(); } + __forceinline static GSVector4i x000fffff() { return xffffffff().srl32<12>(); } + __forceinline static GSVector4i x001fffff() { return xffffffff().srl32<11>(); } + __forceinline static GSVector4i x003fffff() { return xffffffff().srl32<10>(); } + __forceinline static GSVector4i x007fffff() { return xffffffff().srl32<9>(); } + __forceinline static GSVector4i x00ffffff() { return xffffffff().srl32<8>(); } + __forceinline static GSVector4i x01ffffff() { return xffffffff().srl32<7>(); } + __forceinline static GSVector4i x03ffffff() { return xffffffff().srl32<6>(); } + __forceinline static GSVector4i x07ffffff() { return xffffffff().srl32<5>(); } + __forceinline static GSVector4i x0fffffff() { return xffffffff().srl32<4>(); } + __forceinline static GSVector4i x1fffffff() { return xffffffff().srl32<3>(); } + __forceinline static GSVector4i x3fffffff() { return xffffffff().srl32<2>(); } + __forceinline static GSVector4i x7fffffff() { return xffffffff().srl32<1>(); } + + __forceinline static GSVector4i x80000000() { return xffffffff().sll32<31>(); } + __forceinline static GSVector4i xc0000000() { return xffffffff().sll32<30>(); } + __forceinline static GSVector4i xe0000000() { return xffffffff().sll32<29>(); } + __forceinline static GSVector4i xf0000000() { return xffffffff().sll32<28>(); } + __forceinline static GSVector4i xf8000000() { return xffffffff().sll32<27>(); } + __forceinline static GSVector4i xfc000000() { return xffffffff().sll32<26>(); } + __forceinline static GSVector4i xfe000000() { return xffffffff().sll32<25>(); } + __forceinline static GSVector4i xff000000() { return xffffffff().sll32<24>(); } + __forceinline static GSVector4i xff800000() { return xffffffff().sll32<23>(); } + __forceinline static GSVector4i xffc00000() { return xffffffff().sll32<22>(); } + __forceinline static GSVector4i xffe00000() { return xffffffff().sll32<21>(); } + __forceinline static GSVector4i xfff00000() { return xffffffff().sll32<20>(); } + __forceinline static GSVector4i xfff80000() { return xffffffff().sll32<19>(); } + __forceinline static GSVector4i xfffc0000() { return xffffffff().sll32<18>(); } + __forceinline static GSVector4i xfffe0000() { return xffffffff().sll32<17>(); } + __forceinline static GSVector4i xffff0000() { return xffffffff().sll32<16>(); } + __forceinline static GSVector4i xffff8000() { return xffffffff().sll32<15>(); } + __forceinline static GSVector4i xffffc000() { return xffffffff().sll32<14>(); } + __forceinline static GSVector4i xffffe000() { return xffffffff().sll32<13>(); } + __forceinline static GSVector4i xfffff000() { return xffffffff().sll32<12>(); } + __forceinline static GSVector4i xfffff800() { return xffffffff().sll32<11>(); } + __forceinline static GSVector4i xfffffc00() { return xffffffff().sll32<10>(); } + __forceinline static GSVector4i xfffffe00() { return xffffffff().sll32<9>(); } + __forceinline static GSVector4i xffffff00() { return xffffffff().sll32<8>(); } + __forceinline static GSVector4i xffffff80() { return xffffffff().sll32<7>(); } + __forceinline static GSVector4i xffffffc0() { return xffffffff().sll32<6>(); } + __forceinline static GSVector4i xffffffe0() { return xffffffff().sll32<5>(); } + __forceinline static GSVector4i xfffffff0() { return xffffffff().sll32<4>(); } + __forceinline static GSVector4i xfffffff8() { return 
xffffffff().sll32<3>(); } + __forceinline static GSVector4i xfffffffc() { return xffffffff().sll32<2>(); } + __forceinline static GSVector4i xfffffffe() { return xffffffff().sll32<1>(); } + + __forceinline static GSVector4i x0001() { return xffffffff().srl16<15>(); } + __forceinline static GSVector4i x0003() { return xffffffff().srl16<14>(); } + __forceinline static GSVector4i x0007() { return xffffffff().srl16<13>(); } + __forceinline static GSVector4i x000f() { return xffffffff().srl16<12>(); } + __forceinline static GSVector4i x001f() { return xffffffff().srl16<11>(); } + __forceinline static GSVector4i x003f() { return xffffffff().srl16<10>(); } + __forceinline static GSVector4i x007f() { return xffffffff().srl16<9>(); } + __forceinline static GSVector4i x00ff() { return xffffffff().srl16<8>(); } + __forceinline static GSVector4i x01ff() { return xffffffff().srl16<7>(); } + __forceinline static GSVector4i x03ff() { return xffffffff().srl16<6>(); } + __forceinline static GSVector4i x07ff() { return xffffffff().srl16<5>(); } + __forceinline static GSVector4i x0fff() { return xffffffff().srl16<4>(); } + __forceinline static GSVector4i x1fff() { return xffffffff().srl16<3>(); } + __forceinline static GSVector4i x3fff() { return xffffffff().srl16<2>(); } + __forceinline static GSVector4i x7fff() { return xffffffff().srl16<1>(); } + + __forceinline static GSVector4i x8000() { return xffffffff().sll16<15>(); } + __forceinline static GSVector4i xc000() { return xffffffff().sll16<14>(); } + __forceinline static GSVector4i xe000() { return xffffffff().sll16<13>(); } + __forceinline static GSVector4i xf000() { return xffffffff().sll16<12>(); } + __forceinline static GSVector4i xf800() { return xffffffff().sll16<11>(); } + __forceinline static GSVector4i xfc00() { return xffffffff().sll16<10>(); } + __forceinline static GSVector4i xfe00() { return xffffffff().sll16<9>(); } + __forceinline static GSVector4i xff00() { return xffffffff().sll16<8>(); } + __forceinline static GSVector4i xff80() { return xffffffff().sll16<7>(); } + __forceinline static GSVector4i xffc0() { return xffffffff().sll16<6>(); } + __forceinline static GSVector4i xffe0() { return xffffffff().sll16<5>(); } + __forceinline static GSVector4i xfff0() { return xffffffff().sll16<4>(); } + __forceinline static GSVector4i xfff8() { return xffffffff().sll16<3>(); } + __forceinline static GSVector4i xfffc() { return xffffffff().sll16<2>(); } + __forceinline static GSVector4i xfffe() { return xffffffff().sll16<1>(); } + + __forceinline static GSVector4i xffffffff(const GSVector4i& v) { return v == v; } + + __forceinline static GSVector4i x00000001(const GSVector4i& v) { return xffffffff(v).srl32<31>(); } + __forceinline static GSVector4i x00000003(const GSVector4i& v) { return xffffffff(v).srl32<30>(); } + __forceinline static GSVector4i x00000007(const GSVector4i& v) { return xffffffff(v).srl32<29>(); } + __forceinline static GSVector4i x0000000f(const GSVector4i& v) { return xffffffff(v).srl32<28>(); } + __forceinline static GSVector4i x0000001f(const GSVector4i& v) { return xffffffff(v).srl32<27>(); } + __forceinline static GSVector4i x0000003f(const GSVector4i& v) { return xffffffff(v).srl32<26>(); } + __forceinline static GSVector4i x0000007f(const GSVector4i& v) { return xffffffff(v).srl32<25>(); } + __forceinline static GSVector4i x000000ff(const GSVector4i& v) { return xffffffff(v).srl32<24>(); } + __forceinline static GSVector4i x000001ff(const GSVector4i& v) { return xffffffff(v).srl32<23>(); } + __forceinline static GSVector4i 
x000003ff(const GSVector4i& v) { return xffffffff(v).srl32<22>(); } + __forceinline static GSVector4i x000007ff(const GSVector4i& v) { return xffffffff(v).srl32<21>(); } + __forceinline static GSVector4i x00000fff(const GSVector4i& v) { return xffffffff(v).srl32<20>(); } + __forceinline static GSVector4i x00001fff(const GSVector4i& v) { return xffffffff(v).srl32<19>(); } + __forceinline static GSVector4i x00003fff(const GSVector4i& v) { return xffffffff(v).srl32<18>(); } + __forceinline static GSVector4i x00007fff(const GSVector4i& v) { return xffffffff(v).srl32<17>(); } + __forceinline static GSVector4i x0000ffff(const GSVector4i& v) { return xffffffff(v).srl32<16>(); } + __forceinline static GSVector4i x0001ffff(const GSVector4i& v) { return xffffffff(v).srl32<15>(); } + __forceinline static GSVector4i x0003ffff(const GSVector4i& v) { return xffffffff(v).srl32<14>(); } + __forceinline static GSVector4i x0007ffff(const GSVector4i& v) { return xffffffff(v).srl32<13>(); } + __forceinline static GSVector4i x000fffff(const GSVector4i& v) { return xffffffff(v).srl32<12>(); } + __forceinline static GSVector4i x001fffff(const GSVector4i& v) { return xffffffff(v).srl32<11>(); } + __forceinline static GSVector4i x003fffff(const GSVector4i& v) { return xffffffff(v).srl32<10>(); } + __forceinline static GSVector4i x007fffff(const GSVector4i& v) { return xffffffff(v).srl32<9>(); } + __forceinline static GSVector4i x00ffffff(const GSVector4i& v) { return xffffffff(v).srl32<8>(); } + __forceinline static GSVector4i x01ffffff(const GSVector4i& v) { return xffffffff(v).srl32<7>(); } + __forceinline static GSVector4i x03ffffff(const GSVector4i& v) { return xffffffff(v).srl32<6>(); } + __forceinline static GSVector4i x07ffffff(const GSVector4i& v) { return xffffffff(v).srl32<5>(); } + __forceinline static GSVector4i x0fffffff(const GSVector4i& v) { return xffffffff(v).srl32<4>(); } + __forceinline static GSVector4i x1fffffff(const GSVector4i& v) { return xffffffff(v).srl32<3>(); } + __forceinline static GSVector4i x3fffffff(const GSVector4i& v) { return xffffffff(v).srl32<2>(); } + __forceinline static GSVector4i x7fffffff(const GSVector4i& v) { return xffffffff(v).srl32<1>(); } + + __forceinline static GSVector4i x80000000(const GSVector4i& v) { return xffffffff(v).sll32<31>(); } + __forceinline static GSVector4i xc0000000(const GSVector4i& v) { return xffffffff(v).sll32<30>(); } + __forceinline static GSVector4i xe0000000(const GSVector4i& v) { return xffffffff(v).sll32<29>(); } + __forceinline static GSVector4i xf0000000(const GSVector4i& v) { return xffffffff(v).sll32<28>(); } + __forceinline static GSVector4i xf8000000(const GSVector4i& v) { return xffffffff(v).sll32<27>(); } + __forceinline static GSVector4i xfc000000(const GSVector4i& v) { return xffffffff(v).sll32<26>(); } + __forceinline static GSVector4i xfe000000(const GSVector4i& v) { return xffffffff(v).sll32<25>(); } + __forceinline static GSVector4i xff000000(const GSVector4i& v) { return xffffffff(v).sll32<24>(); } + __forceinline static GSVector4i xff800000(const GSVector4i& v) { return xffffffff(v).sll32<23>(); } + __forceinline static GSVector4i xffc00000(const GSVector4i& v) { return xffffffff(v).sll32<22>(); } + __forceinline static GSVector4i xffe00000(const GSVector4i& v) { return xffffffff(v).sll32<21>(); } + __forceinline static GSVector4i xfff00000(const GSVector4i& v) { return xffffffff(v).sll32<20>(); } + __forceinline static GSVector4i xfff80000(const GSVector4i& v) { return xffffffff(v).sll32<19>(); } + __forceinline static 
GSVector4i xfffc0000(const GSVector4i& v) { return xffffffff(v).sll32<18>(); } + __forceinline static GSVector4i xfffe0000(const GSVector4i& v) { return xffffffff(v).sll32<17>(); } + __forceinline static GSVector4i xffff0000(const GSVector4i& v) { return xffffffff(v).sll32<16>(); } + __forceinline static GSVector4i xffff8000(const GSVector4i& v) { return xffffffff(v).sll32<15>(); } + __forceinline static GSVector4i xffffc000(const GSVector4i& v) { return xffffffff(v).sll32<14>(); } + __forceinline static GSVector4i xffffe000(const GSVector4i& v) { return xffffffff(v).sll32<13>(); } + __forceinline static GSVector4i xfffff000(const GSVector4i& v) { return xffffffff(v).sll32<12>(); } + __forceinline static GSVector4i xfffff800(const GSVector4i& v) { return xffffffff(v).sll32<11>(); } + __forceinline static GSVector4i xfffffc00(const GSVector4i& v) { return xffffffff(v).sll32<10>(); } + __forceinline static GSVector4i xfffffe00(const GSVector4i& v) { return xffffffff(v).sll32<9>(); } + __forceinline static GSVector4i xffffff00(const GSVector4i& v) { return xffffffff(v).sll32<8>(); } + __forceinline static GSVector4i xffffff80(const GSVector4i& v) { return xffffffff(v).sll32<7>(); } + __forceinline static GSVector4i xffffffc0(const GSVector4i& v) { return xffffffff(v).sll32<6>(); } + __forceinline static GSVector4i xffffffe0(const GSVector4i& v) { return xffffffff(v).sll32<5>(); } + __forceinline static GSVector4i xfffffff0(const GSVector4i& v) { return xffffffff(v).sll32<4>(); } + __forceinline static GSVector4i xfffffff8(const GSVector4i& v) { return xffffffff(v).sll32<3>(); } + __forceinline static GSVector4i xfffffffc(const GSVector4i& v) { return xffffffff(v).sll32<2>(); } + __forceinline static GSVector4i xfffffffe(const GSVector4i& v) { return xffffffff(v).sll32<1>(); } + + __forceinline static GSVector4i x0001(const GSVector4i& v) { return xffffffff(v).srl16<15>(); } + __forceinline static GSVector4i x0003(const GSVector4i& v) { return xffffffff(v).srl16<14>(); } + __forceinline static GSVector4i x0007(const GSVector4i& v) { return xffffffff(v).srl16<13>(); } + __forceinline static GSVector4i x000f(const GSVector4i& v) { return xffffffff(v).srl16<12>(); } + __forceinline static GSVector4i x001f(const GSVector4i& v) { return xffffffff(v).srl16<11>(); } + __forceinline static GSVector4i x003f(const GSVector4i& v) { return xffffffff(v).srl16<10>(); } + __forceinline static GSVector4i x007f(const GSVector4i& v) { return xffffffff(v).srl16<9>(); } + __forceinline static GSVector4i x00ff(const GSVector4i& v) { return xffffffff(v).srl16<8>(); } + __forceinline static GSVector4i x01ff(const GSVector4i& v) { return xffffffff(v).srl16<7>(); } + __forceinline static GSVector4i x03ff(const GSVector4i& v) { return xffffffff(v).srl16<6>(); } + __forceinline static GSVector4i x07ff(const GSVector4i& v) { return xffffffff(v).srl16<5>(); } + __forceinline static GSVector4i x0fff(const GSVector4i& v) { return xffffffff(v).srl16<4>(); } + __forceinline static GSVector4i x1fff(const GSVector4i& v) { return xffffffff(v).srl16<3>(); } + __forceinline static GSVector4i x3fff(const GSVector4i& v) { return xffffffff(v).srl16<2>(); } + __forceinline static GSVector4i x7fff(const GSVector4i& v) { return xffffffff(v).srl16<1>(); } + + __forceinline static GSVector4i x8000(const GSVector4i& v) { return xffffffff(v).sll16<15>(); } + __forceinline static GSVector4i xc000(const GSVector4i& v) { return xffffffff(v).sll16<14>(); } + __forceinline static GSVector4i xe000(const GSVector4i& v) { return 
xffffffff(v).sll16<13>(); } + __forceinline static GSVector4i xf000(const GSVector4i& v) { return xffffffff(v).sll16<12>(); } + __forceinline static GSVector4i xf800(const GSVector4i& v) { return xffffffff(v).sll16<11>(); } + __forceinline static GSVector4i xfc00(const GSVector4i& v) { return xffffffff(v).sll16<10>(); } + __forceinline static GSVector4i xfe00(const GSVector4i& v) { return xffffffff(v).sll16<9>(); } + __forceinline static GSVector4i xff00(const GSVector4i& v) { return xffffffff(v).sll16<8>(); } + __forceinline static GSVector4i xff80(const GSVector4i& v) { return xffffffff(v).sll16<7>(); } + __forceinline static GSVector4i xffc0(const GSVector4i& v) { return xffffffff(v).sll16<6>(); } + __forceinline static GSVector4i xffe0(const GSVector4i& v) { return xffffffff(v).sll16<5>(); } + __forceinline static GSVector4i xfff0(const GSVector4i& v) { return xffffffff(v).sll16<4>(); } + __forceinline static GSVector4i xfff8(const GSVector4i& v) { return xffffffff(v).sll16<3>(); } + __forceinline static GSVector4i xfffc(const GSVector4i& v) { return xffffffff(v).sll16<2>(); } + __forceinline static GSVector4i xfffe(const GSVector4i& v) { return xffffffff(v).sll16<1>(); } + + __forceinline static GSVector4i xff(int n) { return m_xff[n]; } + __forceinline static GSVector4i x0f(int n) { return m_x0f[n]; } +}; diff --git a/pcsx2/GS/MultiISA.cpp b/pcsx2/GS/MultiISA.cpp index fac9901e81..415fa4d9d9 100644 --- a/pcsx2/GS/MultiISA.cpp +++ b/pcsx2/GS/MultiISA.cpp @@ -11,6 +11,8 @@ #define strcasecmp _stricmp #endif +#ifdef _M_X86 + static ProcessorFeatures::VectorISA getCurrentISA() { // For debugging @@ -41,11 +43,14 @@ static ProcessorFeatures::VectorISA getCurrentISA() return ProcessorFeatures::VectorISA::SSE4; } +#endif + static ProcessorFeatures getProcessorFeatures() { cpuinfo_initialize(); ProcessorFeatures features = {}; +#if defined(_M_X86) features.vectorISA = getCurrentISA(); features.hasFMA = cpuinfo_has_x86_fma3(); if (const char* over = getenv("OVERRIDE_FMA")) @@ -74,6 +79,7 @@ static ProcessorFeatures getProcessorFeatures() features.hasSlowGather = true; } } +#endif return features; } diff --git a/pcsx2/GS/MultiISA.h b/pcsx2/GS/MultiISA.h index 728947e731..8a6a39a80f 100644 --- a/pcsx2/GS/MultiISA.h +++ b/pcsx2/GS/MultiISA.h @@ -44,10 +44,12 @@ struct ProcessorFeatures { +#ifdef _M_X86 enum class VectorISA { SSE4, AVX, AVX2 }; VectorISA vectorISA; bool hasFMA; bool hasSlowGather; +#endif }; extern const ProcessorFeatures g_cpu; diff --git a/pcsx2/GS/Renderers/Common/GSVertex.h b/pcsx2/GS/Renderers/Common/GSVertex.h index 470c8efd1a..fb84b9d499 100644 --- a/pcsx2/GS/Renderers/Common/GSVertex.h +++ b/pcsx2/GS/Renderers/Common/GSVertex.h @@ -21,10 +21,14 @@ struct alignas(32) GSVertex u32 FOG; // FOG:28 }; +#if defined(_M_X86) #if _M_SSE >= 0x500 __m256i mx; #endif __m128i m[2]; +#elif defined(_M_ARM64) + int32x4_t m[2]; +#endif }; }; diff --git a/pcsx2/GS/Renderers/DX11/D3D.cpp b/pcsx2/GS/Renderers/DX11/D3D.cpp index b472db194c..cdbdb40e18 100644 --- a/pcsx2/GS/Renderers/DX11/D3D.cpp +++ b/pcsx2/GS/Renderers/DX11/D3D.cpp @@ -318,8 +318,6 @@ std::string D3D::GetDriverVersionFromLUID(const LUID& luid) return ret; } -#ifdef _M_X86 - D3D::VendorID D3D::GetVendorID(IDXGIAdapter1* adapter) { DXGI_ADAPTER_DESC1 desc; @@ -382,6 +380,7 @@ GSRendererType D3D::GetPreferredRenderer() Console.Error("D3D12CreateDevice() for automatic renderer failed: %08X", hr); return device; }; +#ifdef ENABLE_VULKAN static constexpr auto check_for_mapping_layers = []() { PCWSTR familyName = 
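
// With ProcessorFeatures empty on ARM64, runtime ISA selection disappears:
// there is exactly one NEON code path, so MULTI_ISA dispatch collapses to
// direct calls. Call sites that consulted the x86-only fields need the same
// guard the struct now carries; a hypothetical caller-side sketch:

#ifdef _M_X86
	const bool slow_gather = g_cpu.hasSlowGather; // AVX2 gather quality varies by uarch
#else
	const bool slow_gather = false; // NEON has no gather; the emulated path is always taken
#endif
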
L"Microsoft.D3DMappingLayers_8wekyb3d8bbwe"; UINT32 numPackages = 0, bufferLength = 0; @@ -391,7 +390,7 @@ GSRendererType D3D::GetPreferredRenderer() Host::AddIconOSDMessage("VKDriverUnsupported", ICON_FA_TV, TRANSLATE_STR("GS", "Your system has the \"OpenCL, OpenGL, and Vulkan Compatibility Pack\" installed.\n" - "This Vulkan driver crashes PCSX2 on some GPUs.\n" + "This Vulkan driver crashes PCSX2 on some GPUs.\n" "To use the Vulkan renderer, you should remove this app package."), Host::OSD_WARNING_DURATION); return true; @@ -416,6 +415,9 @@ GSRendererType D3D::GetPreferredRenderer() " to use the Vulkan renderer."), Host::OSD_WARNING_DURATION); return false; }; +#else + static constexpr auto check_vulkan_supported = []() { return false; }; +#endif switch (GetVendorID(adapter.get())) { @@ -470,14 +472,16 @@ GSRendererType D3D::GetPreferredRenderer() default: { - // Default is D3D11 + // Default is D3D11, but prefer DX12 on ARM (better drivers). +#ifdef _M_ARM64 + return GSRendererType::DX12; +#else return GSRendererType::DX11; +#endif } } } -#endif // _M_X86 - wil::com_ptr_nothrow D3D::CompileShader(D3D::ShaderType type, D3D_FEATURE_LEVEL feature_level, bool debug, const std::string_view code, const D3D_SHADER_MACRO* macros /* = nullptr */, const char* entry_point /* = "main" */) diff --git a/pcsx2/GS/Renderers/DX11/D3D.h b/pcsx2/GS/Renderers/DX11/D3D.h index aa9db86f62..56767ca183 100644 --- a/pcsx2/GS/Renderers/DX11/D3D.h +++ b/pcsx2/GS/Renderers/DX11/D3D.h @@ -44,7 +44,6 @@ namespace D3D // returns the driver version from the registry as a string std::string GetDriverVersionFromLUID(const LUID& luid); -#ifdef _M_X86 // this is sort of a legacy thing that doesn't have much to do with d3d (just the easiest way) // checks to see if the adapter at 0 is NV and thus we should prefer OpenGL enum class VendorID @@ -57,7 +56,6 @@ namespace D3D VendorID GetVendorID(IDXGIAdapter1* adapter); GSRendererType GetPreferredRenderer(); -#endif // D3DCompiler wrapper. 
enum class ShaderType diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.h b/pcsx2/GS/Renderers/SW/GSDrawScanline.h index c9331a633b..2f37d1973b 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.h @@ -1,11 +1,18 @@ -// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team +// SPDX-FileCopyrightText: 2002-2024 PCSX2 Dev Team // SPDX-License-Identifier: LGPL-3.0+ #pragma once #include "GS/GSState.h" + +#ifdef _M_X86 #include "GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h" #include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h" +#endif +#ifdef _M_ARM64 +#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h" +#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h" +#endif struct GSScanlineLocalData; diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp new file mode 100644 index 0000000000..1edf20c5a9 --- /dev/null +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.cpp @@ -0,0 +1,2443 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h" +#include "GS/Renderers/SW/GSDrawScanline.h" +#include "GS/Renderers/SW/GSVertexSW.h" +#include "GS/GSState.h" + +#include "common/StringUtil.h" +#include "common/Perf.h" + +#include + +// warning : offset of on non-standard-layout type 'GSScanlineGlobalData' [-Winvalid-offsetof] +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winvalid-offsetof" +#endif + +MULTI_ISA_UNSHARED_IMPL; + +using namespace vixl::aarch64; + +static const auto& _steps = w0; +static const auto& _left = w1; +static const auto& _skip = w1; +static const auto& _top = w2; +static const auto& _v = x3; +static const auto& _locals = x4; + +// x5-x9 used internally + +static const auto& _globals = x10; +static const auto& _vm = x11; +static const auto& _vm_high = x12; +static const auto& _global_tex0 = x13; +static const auto& _global_clut = x14; + +static const auto& _wscratch = w15; +static const auto& _xscratch = x15; +static const auto& _wscratch2 = w16; +static const auto& _xscratch2 = x16; +static const auto& _scratchaddr = x17; + +static const auto& _global_dimx = x19; +static const auto& _local_aref = w20; +static const auto& _global_frb = w21; +static const auto& _global_fga = w22; +static const auto& _global_l = w23; +static const auto& _global_k = w24; +static const auto& _global_mxl = w25; + +static const auto& _vscratch = v31; +static const auto& _vscratch2 = v30; +static const auto& _vscratch3 = v29; +static const auto& _temp_s = v28; +static const auto& _temp_t = v27; +static const auto& _temp_q = v26; +static const auto& _temp_z0 = v25; +static const auto& _temp_z1 = v24; +static const auto& _temp_rb = v23; +static const auto& _temp_ga = v22; +static const auto& _temp_zs = v21; +static const auto& _temp_zd = v20; +static const auto& _temp_vf = v19; +static const auto& _d4_z = v18; +static const auto& _d4_stq = v17; +static const auto& _d4_c = v16; +static const auto& _global_tmin = v15; +static const auto& _global_tmax = v14; +static const auto& _global_tmask = v13; +static const auto& _const_movemskw_mask = v12; +static const auto& _const_log2_coef = v11; +static const auto& _temp_f = v10; +static const auto& _d4_f = v9; + +static const auto& _test = v8; +static const auto& _fd = v2; + +#define _local(field) MemOperand(_locals, offsetof(GSScanlineLocalData, field)) +#define 
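
// The alias table above is the JIT's entire register allocation, fixed for the
// lifetime of the generated function: w0-w4 carry the incoming arguments
// (steps/left/top/vertex/locals), x5-x9 and v29-v31 are scratch, and everything
// that must survive the per-quad loop sits in x19-x26 and v8-v28. Because the
// generated loop makes no calls, only the registers AAPCS64 obliges us to
// preserve for our caller (x19 and up, plus the low halves d8-d15) need the
// Stp/Ldp save and restore emitted in Generate().
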
_global(field) MemOperand(_globals, offsetof(GSScanlineGlobalData, field)) +#define armAsm (&m_emitter) + +GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(u64 key, void* code, size_t maxsize) + : m_emitter(static_cast(code), maxsize, vixl::aarch64::PositionDependentCode) + , m_sel(key) +{ + // hopefully no constants which need to be moved to register first.. + m_emitter.GetScratchRegisterList()->Remove(_xscratch.GetCode()); + m_emitter.GetScratchRegisterList()->Remove(_xscratch2.GetCode()); +} + +void GSDrawScanlineCodeGenerator::Generate() +{ + if (m_sel.breakpoint) + armAsm->Brk(1); + + if (GSDrawScanline::ShouldUseCDrawScanline(m_sel.key)) + { + armAsm->Mov(vixl::aarch64::x15, reinterpret_cast( + static_cast( + &GSDrawScanline::CDrawScanline))); + armAsm->Br(vixl::aarch64::x15); + armAsm->FinalizeCode(); + return; + } + + armAsm->Sub(sp, sp, 128); + armAsm->Stp(x19, x20, MemOperand(sp, 0)); + armAsm->Stp(x21, x22, MemOperand(sp, 16)); + armAsm->Stp(x23, x24, MemOperand(sp, 32)); + armAsm->Stp(x25, x26, MemOperand(sp, 48)); + armAsm->Stp(d8, d9, MemOperand(sp, 64)); + armAsm->Stp(d10, d11, MemOperand(sp, 80)); + armAsm->Stp(d12, d13, MemOperand(sp, 96)); + armAsm->Stp(d14, d15, MemOperand(sp, 112)); + + armAsm->Ldr(_globals, _local(gd)); + armAsm->Ldr(_vm, _global(vm)); + armAsm->Add(_vm_high, _vm, 8 * 2); + + Init(); + + Label loop; + armAsm->Bind(&loop); + + bool tme = m_sel.tfx != TFX_NONE; + + TestZ(tme ? v5 : v2, tme ? v6 : v3); + + if (m_sel.mmin) + SampleTextureLOD(); + else + SampleTexture(); + + AlphaTFX(); + + ReadMask(); + + TestAlpha(); + + ColorTFX(); + + Fog(); + + ReadFrame(); + + TestDestAlpha(); + + WriteMask(); + + WriteZBuf(); + + AlphaBlend(); + + WriteFrame(); + + Label exit; + armAsm->Bind(&m_step_label); + + // if(steps <= 0) break; + + if (!m_sel.edge) + { + armAsm->Cmp(_steps, 0); + armAsm->B(le, &exit); + + Step(); + + armAsm->B(&loop); + } + + armAsm->Bind(&exit); + + armAsm->Ldp(d14, d15, MemOperand(sp, 112)); + armAsm->Ldp(d12, d13, MemOperand(sp, 96)); + armAsm->Ldp(d10, d11, MemOperand(sp, 80)); + armAsm->Ldp(d8, d9, MemOperand(sp, 64)); + armAsm->Ldp(x25, x26, MemOperand(sp, 48)); + armAsm->Ldp(x23, x24, MemOperand(sp, 32)); + armAsm->Ldp(x21, x22, MemOperand(sp, 16)); + armAsm->Ldp(x19, x20, MemOperand(sp, 0)); + armAsm->Add(sp, sp, 128); + + armAsm->Ret(); + + armAsm->FinalizeCode(); + + Perf::any.RegisterKey(GetCode(), GetSize(), "GSDrawScanline_", m_sel.key); +} + +void GSDrawScanlineCodeGenerator::Init() +{ + if (!m_sel.notest) + { + // int skip = left & 3; + + armAsm->Mov(w6, _left); + armAsm->And(_left, _left, 3); + + // int steps = pixels + skip - 4; + + armAsm->Add(_steps, _steps, _left); + armAsm->Sub(_steps, _steps, 4); + + // left -= skip; + + armAsm->Sub(w6, w6, _left); + + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + + armAsm->Lsl(_left, _left, 4); + + armAsm->Add(_scratchaddr, _globals, offsetof(GSScanlineGlobalData, const_test_128b[0])); + armAsm->Ldr(_test, MemOperand(_scratchaddr, x1)); + + armAsm->Asr(w5, _steps, 31); + armAsm->And(w5, w5, _steps); + armAsm->Lsl(w5, w5, 4); + + armAsm->Add(_scratchaddr, _globals, offsetof(GSScanlineGlobalData, const_test_128b[7])); + armAsm->Ldr(_vscratch, MemOperand(_scratchaddr, w5, SXTW)); + armAsm->Orr(_test.V16B(), _test.V16B(), _vscratch.V16B()); + } + else + { + armAsm->Mov(w6, _left); // left + armAsm->Mov(_skip, wzr); // skip + armAsm->Sub(_steps, _steps, 4); // steps + } + + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + + armAsm->Ldr(_scratchaddr, 
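
// The Asr/And pair above computes steps & (steps >> 31), a branchless
// min(steps, 0): the arithmetic shift smears the sign bit into 0 or -1, so the
// AND passes steps through only when it is negative. Indexing m_test[7 + ...]
// with it selects the tail mask when fewer than four pixels remain. The trick
// in scalar form (min0 is a hypothetical name):

static inline int min0(int steps)
{
	// 0 when steps >= 0, steps itself when negative.
	return steps & (steps >> 31);
}
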
_global(fzbr)); + armAsm->Lsl(w7, _top, 3); // *8 + armAsm->Add(x7, _scratchaddr, x7); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + armAsm->Ldr(_scratchaddr, _global(fzbc)); + armAsm->Lsl(w8, w6, 1); // *2 + armAsm->Add(x8, _scratchaddr, x8); + + if ((m_sel.prim != GS_SPRITE_CLASS && ((m_sel.fwrite && m_sel.fge) || m_sel.zb)) || (m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))) + { + // w1 = &m_local.d[skip] + + armAsm->Lsl(w1, w1, 3); // *8 + armAsm->Add(x1, x1, _locals); + static_assert(offsetof(GSScanlineLocalData, d) == 0); + } + + if (m_sel.prim != GS_SPRITE_CLASS) + { + if ((m_sel.fwrite && m_sel.fge) || m_sel.zb) + { + armAsm->Ldr(_d4_z, _local(d4.z)); + + if (m_sel.fwrite && m_sel.fge) + { + // f = GSVector4i(v.t).zzzzh().zzzz().add16(m_local.d[skip].f); + armAsm->Ldr(_temp_f.S(), MemOperand(_v, offsetof(GSVertexSW, t.w))); + armAsm->Ldr(_vscratch, MemOperand(x1, offsetof(GSScanlineLocalData::skip, f))); + armAsm->Ldr(_d4_f, _local(d4.f)); + + armAsm->Fcvtzs(_temp_f.S(), _temp_f.S()); + armAsm->Dup(_temp_f.V8H(), _temp_f.V8H(), 0); + armAsm->Add(_temp_f.V8H(), _temp_f.V8H(), _vscratch.V8H()); + } + + if (m_sel.zb && m_sel.zequal) + { + armAsm->Ldr(_temp_z0.D(), MemOperand(_v, offsetof(GSVertexSW, p.z))); + armAsm->Fcvtzs(_temp_z0.D(), _temp_z0.D()); + armAsm->Dup(_temp_z0.V4S(), _temp_z0.V4S(), 0); + } + else if (m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + armAsm->Add(_scratchaddr, _v, offsetof(GSVertexSW, p.z)); + armAsm->Ldr(_vscratch, MemOperand(x1, offsetof(GSScanlineLocalData::skip, z))); + armAsm->Ld1r(_vscratch2.V2D(), MemOperand(_scratchaddr)); + + // low + armAsm->Fcvtl(_temp_z0.V2D(), _vscratch.V2S()); + armAsm->Fadd(_temp_z0.V2D(), _temp_z0.V2D(), _vscratch2.V2D()); + + // high + armAsm->Fcvtl2(_temp_z1.V2D(), _vscratch.V4S()); + armAsm->Fadd(_temp_z1.V2D(), _temp_z1.V2D(), _vscratch2.V2D()); + } + } + } + else + { + if (m_sel.ztest || m_sel.zwrite) + { + armAsm->Ldr(_temp_z0, _local(p.z)); + } + + if (m_sel.fwrite && m_sel.fge) + { + armAsm->Ldr(_temp_f, _local(p.f)); + } + } + + if (m_sel.fb) + { + if (m_sel.edge) + { + armAsm->Add(_scratchaddr, _v, offsetof(GSVertexSW, p.x)); + armAsm->Ld1r(v3.V8H(), MemOperand(_scratchaddr)); + } + + if (m_sel.tfx != TFX_NONE) + { + armAsm->Ldr(v4, MemOperand(_v, offsetof(GSVertexSW, t))); + } + + if (m_sel.edge) + { + // m_local.temp.cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9); + armAsm->Ushr(v3.V8H(), v3.V8H(), 9); + armAsm->Str(v3, _local(temp.cov)); + } + + if (m_sel.tfx != TFX_NONE) + { + if (m_sel.fst) + { + // GSVector4i vti(vt); + + armAsm->Fcvtzs(v6.V4S(), v4.V4S()); + + // s = vti.xxxx() + m_local.d[skip].s; + // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; + + armAsm->Dup(_temp_s.V4S(), v6.V4S(), 0); + armAsm->Dup(_temp_t.V4S(), v6.V4S(), 1); + + armAsm->Ldr(_vscratch, MemOperand(x1, offsetof(GSScanlineLocalData::skip, s))); + armAsm->Add(_temp_s.V4S(), _temp_s.V4S(), _vscratch.V4S()); + + if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + { + armAsm->Ldr(_vscratch, MemOperand(x1, offsetof(GSScanlineLocalData::skip, t))); + armAsm->Add(_temp_t.V4S(), _temp_t.V4S(), _vscratch.V4S()); + } + else + { + if (m_sel.ltf) + { + armAsm->Trn1(_temp_vf.V8H(), _temp_t.V8H(), _temp_t.V8H()); + armAsm->Ushr(_temp_vf.V8H(), _temp_vf.V8H(), 12); + } + } + } + else + { + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + armAsm->Ldr(_temp_s, MemOperand(x1, offsetof(GSScanlineLocalData::skip, 
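
// Note why Z interpolation runs in two f64 halves (_temp_z0/_temp_z1 via
// Fcvtl/Fcvtl2 and Fadd on .2D): PS2 depth values are full 32-bit integers,
// and a float's 24-bit mantissa cannot step through them exactly, while a
// double can. A two-line demonstration (depth_precision_demo is hypothetical):

#include <cassert>

static inline void depth_precision_demo()
{
	const unsigned z = 0x01000001u; // 2^24 + 1
	assert(static_cast<float>(z) == 16777216.0f); // float rounds to 2^24
	assert(static_cast<double>(z) == 16777217.0); // double is exact
}
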
s))); + armAsm->Ldr(_temp_t, MemOperand(x1, offsetof(GSScanlineLocalData::skip, t))); + armAsm->Ldr(_temp_q, MemOperand(x1, offsetof(GSScanlineLocalData::skip, q))); + + armAsm->Dup(v2.V4S(), v4.V4S(), 0); + armAsm->Dup(v3.V4S(), v4.V4S(), 1); + armAsm->Dup(v4.V4S(), v4.V4S(), 2); + + armAsm->Fadd(_temp_s.V4S(), v2.V4S(), _temp_s.V4S()); + armAsm->Fadd(_temp_t.V4S(), v3.V4S(), _temp_t.V4S()); + armAsm->Fadd(_temp_q.V4S(), v4.V4S(), _temp_q.V4S()); + } + + armAsm->Ldr(_d4_stq, _local(d4.stq)); + armAsm->Ldr(_global_tmin, _global(t.min)); + armAsm->Ldr(_global_tmax, _global(t.max)); + armAsm->Ldr(_global_tmask, _global(t.mask)); + + if (!m_sel.mmin) + armAsm->Ldr(_global_tex0, _global(tex[0])); + else + armAsm->Add(_global_tex0, _globals, offsetof(GSScanlineGlobalData, tex)); + + if (m_sel.tlu) + armAsm->Ldr(_global_clut, _global(clut)); + } + + if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if (m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + armAsm->Ldr(v6, MemOperand(_v, offsetof(GSVertexSW, c))); + armAsm->Ldr(v1, MemOperand(x1, offsetof(GSScanlineLocalData::skip, rb))); + armAsm->Ldr(_vscratch, MemOperand(x1, offsetof(GSScanlineLocalData::skip, ga))); + armAsm->Fcvtzs(v6.V4S(), v6.V4S()); + + // vc = vc.upl16(vc.zwxy()); + + armAsm->Ext(v5.V16B(), v6.V16B(), v6.V16B(), 8); + armAsm->Zip1(v6.V8H(), v6.V8H(), v5.V8H()); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + armAsm->Dup(_temp_rb.V4S(), v6.V4S(), 0); + armAsm->Dup(_temp_ga.V4S(), v6.V4S(), 2); + + armAsm->Add(_temp_rb.V8H(), _temp_rb.V8H(), v1.V8H()); + armAsm->Add(_temp_ga.V8H(), _temp_ga.V8H(), _vscratch.V8H()); + + armAsm->Ldr(_d4_c, _local(d4.c)); + } + else + { + armAsm->Ldr(_temp_rb, _local(c.rb)); + armAsm->Ldr(_temp_ga, _local(c.ga)); + } + } + } + + if (m_sel.atst != ATST_ALWAYS && m_sel.atst != ATST_NEVER) + { + armAsm->Ldr(_local_aref, _global(aref)); + } + + if (m_sel.fwrite && m_sel.fge) + { + armAsm->Ldr(_global_frb, _global(frb)); + armAsm->Ldr(_global_fga, _global(fga)); + } + + if (!m_sel.notest) + armAsm->Ldr(_const_movemskw_mask, _global(const_movemaskw_mask)); + + if (m_sel.mmin && !m_sel.lcm) + { + armAsm->Ldr(_const_log2_coef, _global(const_log2_coef)); + armAsm->Ldr(_global_l, _global(l)); + armAsm->Ldr(_global_k, _global(k)); + armAsm->Ldr(_global_mxl, _global(mxl)); + } + + if (m_sel.fpsm == 2 && m_sel.dthe) + armAsm->Ldr(_global_dimx, _global(dimx)); +} + +void GSDrawScanlineCodeGenerator::Step() +{ + // steps -= 4; + + armAsm->Sub(_steps, _steps, 4); + + // fza_offset++; + + armAsm->Add(x8, x8, 8); + + if (m_sel.prim != GS_SPRITE_CLASS) + { + // z += m_local.d4.z; + + if (m_sel.zb && !m_sel.zequal) + { + armAsm->Fadd(_temp_z1.V2D(), _temp_z1.V2D(), _d4_z.V2D()); + armAsm->Fadd(_temp_z0.V2D(), _temp_z0.V2D(), _d4_z.V2D()); + } + + // f = f.add16(m_local.d4.f); + + if (m_sel.fwrite && m_sel.fge) + { + armAsm->Add(_temp_f.V8H(), _temp_f.V8H(), _d4_f.V8H()); + } + } + + if (m_sel.fb) + { + if (m_sel.tfx != TFX_NONE) + { + if (m_sel.fst) + { + // GSVector4i stq = m_local.d4.stq; + + // s += stq.xxxx(); + // if(!sprite) t += stq.yyyy(); + + armAsm->Dup(_vscratch.V4S(), _d4_stq.V4S(), 0); + if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + armAsm->Dup(_vscratch2.V4S(), _d4_stq.V4S(), 1); + + armAsm->Add(_temp_s.V4S(), _temp_s.V4S(), _vscratch.V4S()); + + if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + armAsm->Add(_temp_t.V4S(), _temp_t.V4S(), _vscratch2.V4S()); + } + else + { + // GSVector4 stq = m_local.d4.stq; + + // s += stq.xxxx(); + // t += 
stq.yyyy(); + // q += stq.zzzz(); + + armAsm->Dup(_vscratch.V4S(), _d4_stq.V4S(), 0); + armAsm->Dup(_vscratch2.V4S(), _d4_stq.V4S(), 1); + armAsm->Dup(v1.V4S(), _d4_stq.V4S(), 2); + + armAsm->Fadd(_temp_s.V4S(), _temp_s.V4S(), _vscratch.V4S()); + armAsm->Fadd(_temp_t.V4S(), _temp_t.V4S(), _vscratch2.V4S()); + armAsm->Fadd(_temp_q.V4S(), _temp_q.V4S(), v1.V4S()); + } + } + + if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if (m_sel.iip) + { + // GSVector4i c = m_local.d4.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + armAsm->Dup(_vscratch.V4S(), _d4_c.V4S(), 0); + armAsm->Dup(_vscratch2.V4S(), _d4_c.V4S(), 1); + armAsm->Movi(v1.V8H(), 0); + + armAsm->Add(_temp_rb.V8H(), _temp_rb.V8H(), _vscratch.V8H()); + armAsm->Add(_temp_ga.V8H(), _temp_ga.V8H(), _vscratch2.V8H()); + + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + armAsm->Smax(_temp_rb.V8H(), _temp_rb.V8H(), v1.V8H()); + armAsm->Smax(_temp_ga.V8H(), _temp_ga.V8H(), v1.V8H()); + } + } + } + + if (!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; + + armAsm->Asr(w1, _steps, 31); + armAsm->And(w1, w1, _steps); + armAsm->Lsl(w1, w1, 4); + + armAsm->Add(_scratchaddr, _globals, offsetof(GSScanlineGlobalData, const_test_128b[7])); + armAsm->Ldr(_test, MemOperand(_scratchaddr, x1, SXTW)); + } +} + +void GSDrawScanlineCodeGenerator::TestZ(const VRegister& temp1, const VRegister& temp2) +{ + if (!m_sel.zb) + { + return; + } + + // int za = fza_base.y + fza_offset->y; + + armAsm->Ldr(w9, MemOperand(x7, 4)); + armAsm->Ldr(_wscratch, MemOperand(x8, 4)); + armAsm->Add(w9, w9, _wscratch); + armAsm->And(w9, w9, HALF_VM_SIZE - 1); + + // GSVector4i zs = zi; + + VRegister zs; + if (m_sel.prim != GS_SPRITE_CLASS) + { + if (m_sel.zequal) + { + zs = _temp_z0; + } + else if (m_sel.zoverflow) + { + // GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(); + // GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(); + + armAsm->Movi(temp1.V2D(), GSVector4::m_xc1e00000000fffff.U64[0]); + armAsm->Fadd(_temp_z0.V2D(), _temp_z0.V2D(), temp1.V2D()); + armAsm->Fadd(_temp_z1.V2D(), _temp_z1.V2D(), temp1.V2D()); + + // zs = GSVector8i(zl, zh); + armAsm->Fcvtzs(v0.V2D(), _temp_z0.V2D()); + armAsm->Fcvtzs(temp1.V2D(), _temp_z1.V2D()); + armAsm->Movi(_vscratch.V4S(), 0x80000000); + armAsm->Uzp1(v0.V4S(), v0.V4S(), temp1.V4S()); + + // zs += VectorI::x80000000(); + armAsm->Add(v0.V4S(), v0.V4S(), _vscratch.V4S()); + zs = v0; + } + else + { + // zs = GSVector8i(z0.f64toi32(), z1.f64toi32()); + + armAsm->Fcvtzs(v0.V2D(), _temp_z0.V2D()); + armAsm->Fcvtzs(temp1.V2D(), _temp_z1.V2D()); + armAsm->Uzp1(v0.V4S(), v0.V4S(), temp1.V4S()); + zs = v0; + } + + + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { + armAsm->Movi(temp1.V4S(), 0xFFFFFFFFu >> ((m_sel.zpsm & 0x3) * 8)); + armAsm->Umin(v0.V4S(), zs.V4S(), temp1.V4S()); + zs = v0; + } + + if (m_sel.zwrite) + armAsm->Mov(_temp_zs, zs); + } + else + { + zs = _temp_z0; + } + + if (m_sel.ztest) + { + VRegister zd(_temp_zd); + ReadPixel(zd, w9); + + // zd &= 0xffffffff >> m_sel.zpsm * 8; + + if (m_sel.zpsm) + { + armAsm->Shl(v1.V4S(), zd.V4S(), m_sel.zpsm * 8); + armAsm->Ushr(v1.V4S(), v1.V4S(), m_sel.zpsm * 8); + zd = v1; + } + + if (m_sel.zpsm == 0) + { + // GSVector4i o = GSVector4i::x80000000(); + armAsm->Movi(temp1.V4S(), 0x80000000u); + + // GSVector4i zso = zs - o; + // GSVector4i zdo = zd - o; + armAsm->Sub(v0.V4S(), zs.V4S(), temp1.V4S()); + armAsm->Sub(v1.V4S(), zd.V4S(), temp1.V4S()); + zs = v0; + zd = v1; + } + + 
switch (m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + armAsm->Cmgt(v1.V4S(), zd.V4S(), zs.V4S()); + armAsm->Orr(_test.V16B(), _test.V16B(), v1.V16B()); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + armAsm->Cmgt(v0.V4S(), zs.V4S(), zd.V4S()); + armAsm->Mvn(v0.V16B(), v0.V16B()); + armAsm->Orr(_test.V16B(), _test.V16B(), v0.V16B()); + break; + } + + alltrue(_test, temp1); + } +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ + if (!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + const auto& uf = v4; + const auto& vf = v7; + + VRegister ureg = _temp_s; + VRegister vreg = _temp_t; + + if (!m_sel.fst) + { + armAsm->Fdiv(v2.V4S(), _temp_s.V4S(), _temp_q.V4S()); + armAsm->Fdiv(v3.V4S(), _temp_t.V4S(), _temp_q.V4S()); + ureg = v2; + vreg = v3; + + armAsm->Fcvtzs(v2.V4S(), v2.V4S()); + armAsm->Fcvtzs(v3.V4S(), v3.V4S()); + + if (m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + armAsm->Movi(v1.V4S(), 0x8000); + armAsm->Sub(v2.V4S(), v2.V4S(), v1.V4S()); + armAsm->Sub(v3.V4S(), v3.V4S(), v1.V4S()); + } + } + + if (m_sel.ltf) + { + // GSVector4i uf = u.xxzzlh().srl16(12); + + armAsm->Trn1(uf.V8H(), ureg.V8H(), ureg.V8H()); + armAsm->Ushr(uf.V8H(), uf.V8H(), 12); + + if (m_sel.prim != GS_SPRITE_CLASS) + { + // GSVector4i vf = v.xxzzlh().srl16(12); + + armAsm->Trn1(vf.V8H(), vreg.V8H(), vreg.V8H()); + armAsm->Ushr(vf.V8H(), vf.V8H(), 12); + } + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + armAsm->Sshr(v2.V4S(), ureg.V4S(), 16); + armAsm->Sshr(v3.V4S(), vreg.V4S(), 16); + armAsm->Sqxtn(v2.V4H(), v2.V4S()); + armAsm->Sqxtn2(v2.V8H(), v3.V4S()); + + if (m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + armAsm->Movi(v1.V8H(), 1); + armAsm->Add(v3.V8H(), v2.V8H(), v1.V8H()); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(v2, v3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(v2); + } + + SampleTexture_TexelReadHelper(0); +} + +void GSDrawScanlineCodeGenerator::SampleTexture_TexelReadHelper(int mip_offset) +{ + const auto& uf = v4; + const auto& vf = (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) ? 
v7 : _temp_vf; + + // GSVector4i y0 = uv0.uph16() << tw; + // GSVector4i x0 = uv0.upl16(); + + armAsm->Movi(v0.V8H(), 0); + + armAsm->Zip1(v5.V8H(), v2.V8H(), v0.V8H()); + armAsm->Zip2(v2.V8H(), v2.V8H(), v0.V8H()); + armAsm->Shl(v2.V4S(), v2.V4S(), m_sel.tw + 3); + + if (m_sel.ltf) + { + // GSVector4i x1 = uv1.upl16(); + // GSVector4i y1 = uv1.uph16() << tw; + + armAsm->Zip1(v1.V8H(), v3.V8H(), v0.V8H()); + armAsm->Zip2(v3.V8H(), v3.V8H(), v0.V8H()); + armAsm->Shl(v3.V4S(), v3.V4S(), m_sel.tw + 3); + + // GSVector4i addr00 = y0 + x0; + // GSVector4i addr01 = y0 + x1; + // GSVector4i addr10 = y1 + x0; + // GSVector4i addr11 = y1 + x1; + + armAsm->Add(v0.V4S(), v3.V4S(), v1.V4S()); // addr11 + armAsm->Add(v1.V4S(), v1.V4S(), v2.V4S()); // addr01 + armAsm->Add(v2.V4S(), v2.V4S(), v5.V4S()); // addr00 + armAsm->Add(v3.V4S(), v3.V4S(), v5.V4S()); // addr10 + + // c00 = addr00.gather32_32((const u32/u8*)tex[, clut]); + // c01 = addr01.gather32_32((const u32/u8*)tex[, clut]); + // c10 = addr10.gather32_32((const u32/u8*)tex[, clut]); + // c11 = addr11.gather32_32((const u32/u8*)tex[, clut]); + + // d0 d1 d2s0 d3s1 s2 s3 + ReadTexel4(v5, v6, v0, v2, v1, v3, mip_offset); + + // GSVector4i rb00 = c00 & mask; + // GSVector4i ga00 = (c00 >> 8) & mask; + + split16_2x8(v3, v6, v6); + + // GSVector4i rb01 = c01 & mask; + // GSVector4i ga01 = (c01 >> 8) & mask; + + split16_2x8(v0, v1, v0); + + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); + + lerp16_4(v0, v3, uf); + lerp16_4(v1, v6, uf); + + // GSVector4i rb10 = c10 & mask; + // GSVector4i ga10 = (c10 >> 8) & mask; + + split16_2x8(v2, v3, v2); + + // GSVector4i rb11 = c11 & mask; + // GSVector4i ga11 = (c11 >> 8) & mask; + + split16_2x8(v5, v6, v5); + + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); + + lerp16_4(v5, v2, uf); + lerp16_4(v6, v3, uf); + + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); + + lerp16_4(v5, v0, vf); + lerp16_4(v6, v1, vf); + } + else + { + // GSVector4i addr00 = y0 + x0; + + armAsm->Add(v2.V4S(), v2.V4S(), v5.V4S()); + + // c00 = addr00.gather32_32((const u32/u8*)tex[, clut]); + + ReadTexel1(v5, v2, v0, mip_offset); + + // GSVector4i mask = GSVector4i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + split16_2x8(v5, v6, v5); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const VRegister& uv) +{ + // v0, v1, v4, v5, v6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + armAsm->Smax(uv.V8H(), uv.V8H(), _global_tmin.V8H()); + } + else + { + armAsm->Movi(v0.V8H(), 0); + armAsm->Smax(uv.V8H(), uv.V8H(), v0.V8H()); + } + + armAsm->Smin(uv.V8H(), uv.V8H(), _global_tmax.V8H()); + } + else + { + armAsm->And(uv.V16B(), uv.V16B(), _global_tmin.V16B()); + + if (region) + armAsm->Orr(uv.V16B(), uv.V16B(), _global_tmax.V16B()); + } + } + else + { + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + armAsm->And(v1.V16B(), uv.V16B(), _global_tmin.V16B()); + + if (region) + armAsm->Orr(v1.V16B(), v1.V16B(), _global_tmax.V16B()); + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + armAsm->Smax(uv.V8H(), uv.V8H(), _global_tmin.V8H()); + armAsm->Smin(_vscratch.V8H(), uv.V8H(), _global_tmax.V8H()); + + // clamp.blend8(repeat, m_local.gd->t.mask); + armAsm->Sshr(uv.V16B(), _global_tmask.V16B(), 7); + armAsm->Bsl(uv.V16B(), 
v1.V16B(), _vscratch.V16B()); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const VRegister& uv0, const VRegister& uv1) +{ + // v0, v1, v4, v5, v6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + armAsm->Smax(uv0.V8H(), uv0.V8H(), _global_tmin.V8H()); + armAsm->Smax(uv1.V8H(), uv1.V8H(), _global_tmin.V8H()); + } + else + { + armAsm->Movi(v0.V16B(), 0); + armAsm->Smax(uv0.V8H(), uv0.V8H(), v0.V8H()); + armAsm->Smax(uv1.V8H(), uv1.V8H(), v0.V8H()); + } + + armAsm->Smin(uv0.V8H(), uv0.V8H(), _global_tmax.V8H()); + armAsm->Smin(uv1.V8H(), uv1.V8H(), _global_tmax.V8H()); + } + else + { + armAsm->And(uv0.V16B(), uv0.V16B(), _global_tmin.V16B()); + armAsm->And(uv1.V16B(), uv1.V16B(), _global_tmin.V16B()); + + if (region) + { + armAsm->Orr(uv0.V16B(), uv0.V16B(), _global_tmax.V16B()); + armAsm->Orr(uv1.V16B(), uv1.V16B(), _global_tmax.V16B()); + } + } + } + else + { + // uv0 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + armAsm->And(v1.V16B(), uv0.V16B(), _global_tmin.V16B()); + + if (region) + armAsm->Orr(v1.V16B(), v1.V16B(), _global_tmax.V16B()); + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + armAsm->Smax(uv0.V8H(), uv0.V8H(), _global_tmin.V8H()); + armAsm->Smin(_vscratch.V8H(), uv0.V8H(), _global_tmax.V8H()); + + // clamp.blend8(repeat, m_local.gd->t.mask); + armAsm->Sshr(uv0.V16B(), _global_tmask.V16B(), 7); + armAsm->Bsl(uv0.V16B(), v1.V16B(), _vscratch.V16B()); + + // uv1 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + armAsm->And(v1.V16B(), uv1.V16B(), _global_tmin.V16B()); + + if (region) + armAsm->Orr(v1.V16B(), v1.V16B(), _global_tmax.V16B()); + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + armAsm->Smax(uv1.V8H(), uv1.V8H(), _global_tmin.V8H()); + armAsm->Smin(_vscratch.V8H(), uv1.V8H(), _global_tmax.V8H()); + + // clamp.blend8(repeat, m_local.gd->t.mask); + armAsm->Sshr(uv1.V16B(), _global_tmask.V16B(), 7); + armAsm->Bsl(uv1.V16B(), v1.V16B(), _vscratch.V16B()); + } +} + +/// Input: v4=q, v2=s, v3=t +/// Output: _rb, _ga +void GSDrawScanlineCodeGenerator::SampleTextureLOD() +{ + if (!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + const auto& uf = v4; + const auto& vf = v7; + const auto& local0 = _vscratch; // used for uv + const auto& local1 = _vscratch2; // used for uv + const auto& local2 = _vscratch3; + + VRegister uv0(_temp_s); + VRegister uv1(_temp_t); + + if (!m_sel.fst) + { + armAsm->Fdiv(local0.V4S(), _temp_s.V4S(), _temp_q.V4S()); + armAsm->Fdiv(local1.V4S(), _temp_t.V4S(), _temp_q.V4S()); + + armAsm->Fcvtzs(local0.V4S(), local0.V4S()); + armAsm->Fcvtzs(local1.V4S(), local1.V4S()); + + uv0 = local0; + uv1 = local1; + } + + // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) 
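+	// For reference, the !lcm path below computes log2(Q) purely in NEON: the
+	// biased exponent is pulled out of the float bits (giving floor(log2(q)))
+	// and refined with a cubic in the mantissa m in [1, 2),
+	//   log2(q) ~= (exp - 127) + ((c0 * m + c1) * m + c2) * (m - 1.0f)
+	// using the c0..c3 constants stored in const_log2_coef.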
+
+	if (!m_sel.lcm)
+	{
+		// lod = -log2(Q) * (1 << L) + K
+
+		armAsm->Movi(v1.V4S(), 127);
+		armAsm->Shl(v0.V4S(), _temp_q.V4S(), 1);
+		armAsm->Ushr(v0.V4S(), v0.V4S(), 24);
+		armAsm->Sub(v0.V4S(), v0.V4S(), v1.V4S());
+		armAsm->Scvtf(v0.V4S(), v0.V4S());
+
+		// v0 = (float)(exp(q) - 127)
+
+		armAsm->Shl(v4.V4S(), _temp_q.V4S(), 9);
+		armAsm->Ushr(v4.V4S(), v4.V4S(), 9);
+
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 3);
+		armAsm->Orr(v4.V16B(), v4.V16B(), v1.V16B()); // m_log2_coef_128b[3]
+
+		// v4 = mant(q) | 1.0f
+		// v4 = log2(Q) = ((((c0 * v4) + c1) * v4) + c2) * (v4 - 1.0f) + v0
+
+#if 0
+		// non-fma
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 0);
+		armAsm->Fmul(v5.V4S(), v4.V4S(), v1.V4S());
+
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 1);
+		armAsm->Fadd(v5.V4S(), v5.V4S(), v1.V4S());
+
+		armAsm->Fmul(v5.V4S(), v5.V4S(), v4.V4S());
+
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 3);
+		armAsm->Fsub(v4.V4S(), v4.V4S(), v1.V4S());
+
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 2);
+		armAsm->Fadd(v5.V4S(), v5.V4S(), v1.V4S());
+
+		armAsm->Fmul(v4.V4S(), v4.V4S(), v5.V4S());
+		armAsm->Fadd(v4.V4S(), v4.V4S(), v0.V4S());
+
+		armAsm->Dup(v0.V4S(), _global_l);
+		armAsm->Dup(v1.V4S(), _global_k);
+
+		armAsm->Fmul(v4.V4S(), v4.V4S(), v0.V4S());
+		armAsm->Fadd(v4.V4S(), v4.V4S(), v1.V4S());
+#else
+		// fma
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 0); // v1 = c0
+		armAsm->Dup(local2.V4S(), _const_log2_coef.V4S(), 1); // local2 = c1
+		armAsm->Fmla(local2.V4S(), v4.V4S(), v1.V4S()); // local2 = c0 * v4 + c1
+		armAsm->Dup(v1.V4S(), _const_log2_coef.V4S(), 2); // v1 = c2
+		armAsm->Fmla(v1.V4S(), local2.V4S(), v4.V4S()); // v1 = ((c0 * v4 + c1) * v4) + c2
+		armAsm->Dup(local2.V4S(), _const_log2_coef.V4S(), 3); // local2 = c3
+		armAsm->Fsub(v4.V4S(), v4.V4S(), local2.V4S()); // v4 -= 1.0f
+		armAsm->Fmla(v0.V4S(), v4.V4S(), v1.V4S()); // v0 = (v4 - 1.0f) * (((c0 * v4 + c1) * v4) + c2) + v0
+
+		armAsm->Dup(v1.V4S(), _global_l); // v1 = llll
+		armAsm->Dup(v4.V4S(), _global_k); // v4 = kkkk
+		armAsm->Fmla(v4.V4S(), v0.V4S(), v1.V4S()); // v4 = k + v0 * l
+#endif
+
+		// v4 = (-log2(Q) * (1 << L) + K) * 0x10000
+
+		armAsm->Dup(v0.V4S(), _global_mxl);
+		armAsm->Movi(v1.V4S(), 0);
+		armAsm->Fminnm(v4.V4S(), v4.V4S(), v0.V4S());
+		armAsm->Fmaxnm(v4.V4S(), v4.V4S(), v1.V4S());
+		armAsm->Fcvtzs(v4.V4S(), v4.V4S());
+
+		if (m_sel.mmin == 1) // round-off mode
+		{
+			armAsm->Movi(v0.V4S(), 0x8000);
+			armAsm->Add(v4.V4S(), v4.V4S(), v0.V4S());
+		}
+
+		armAsm->Ushr(v0.V4S(), v4.V4S(), 16);
+
+		// NOTE: Must go to memory, it gets indexed
+		armAsm->Str(v0, _local(temp.lod.i));
+
+		if (m_sel.mmin == 2) // trilinear mode
+		{
+			armAsm->Trn1(v1.V8H(), v4.V8H(), v4.V8H());
+			armAsm->Str(v1.V4S(), _local(temp.lod.f));
+		}
+
+		// shift u/v/minmax by (int)lod
+
+		armAsm->Neg(v0.V4S(), v0.V4S());
+		armAsm->Sshl(local0.V4S(), uv0.V4S(), v0.V4S());
+		armAsm->Sshl(local1.V4S(), uv1.V4S(), v0.V4S());
+		uv0 = local0;
+		uv1 = local1;
+
+		// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
+
+		armAsm->Movi(v1.V4S(), 0);
+		armAsm->Zip1(v5.V8H(), _global_tmin.V8H(), v1.V8H()); // minu
+		armAsm->Zip2(v6.V8H(), _global_tmin.V8H(), v1.V8H()); // minv
+		armAsm->Ushl(v5.V4S(), v5.V4S(), v0.V4S());
+		armAsm->Ushl(v6.V4S(), v6.V4S(), v0.V4S());
+		armAsm->Sqxtun(v5.V4H(), v5.V4S());
+		armAsm->Sqxtun2(v5.V8H(), v6.V4S());
+
+		armAsm->Zip1(v6.V8H(), _global_tmax.V8H(), v1.V8H()); // maxu
+		armAsm->Zip2(v4.V8H(), _global_tmax.V8H(), v1.V8H()); // maxv
+		armAsm->Ushl(v6.V4S(), v6.V4S(), v0.V4S());
+		armAsm->Ushl(v4.V4S(), v4.V4S(), v0.V4S());
+		armAsm->Sqxtun(v6.V4H(), v6.V4S());
+		armAsm->Sqxtun2(v6.V8H(), v4.V4S());
+
+		if (m_sel.mmin != 1)
+		{
+			armAsm->Str(v5, _local(temp.uv_minmax[0]));
+			armAsm->Str(v6, _local(temp.uv_minmax[1]));
+		}
+	}
+	else
+	{
+		// lod = K
+
+		armAsm->Add(_scratchaddr, _globals, offsetof(GSScanlineGlobalData, lod));
+		armAsm->Ld1r(v0.V4S(), MemOperand(_scratchaddr));
+		armAsm->Neg(v0.V4S(), v0.V4S());
+
+		armAsm->Sshl(local0.V4S(), uv0.V4S(), v0.V4S());
+		armAsm->Sshl(local1.V4S(), uv1.V4S(), v0.V4S());
+		uv0 = local0;
+		uv1 = local1;
+
+		armAsm->Ldr(v5, _local(temp.uv_minmax[0]));
+		armAsm->Ldr(v6, _local(temp.uv_minmax[1]));
+	}
+
+	if (m_sel.ltf)
+	{
+		// u -= 0x8000;
+		// v -= 0x8000;
+
+		armAsm->Movi(v4.V4S(), 0x8000);
+		armAsm->Sub(v2.V4S(), uv0.V4S(), v4.V4S());
+		armAsm->Sub(v3.V4S(), uv1.V4S(), v4.V4S());
+
+		// GSVector4i uf = u.xxzzlh().srl16(12);
+
+		armAsm->Trn1(uf.V8H(), v2.V8H(), v2.V8H());
+		armAsm->Ushr(uf.V8H(), uf.V8H(), 12);
+
+		// GSVector4i vf = v.xxzzlh().srl16(12);
+
+		armAsm->Trn1(vf.V8H(), v3.V8H(), v3.V8H());
+		armAsm->Ushr(vf.V8H(), vf.V8H(), 12);
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	armAsm->Sshr(v2.V4S(), m_sel.ltf ? v2.V4S() : uv0.V4S(), 16);
+	armAsm->Sshr(v3.V4S(), m_sel.ltf ? v3.V4S() : uv1.V4S(), 16);
+	armAsm->Sqxtn(v2.V4H(), v2.V4S());
+	armAsm->Sqxtn2(v2.V8H(), v3.V4S());
+
+	if (m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		armAsm->Movi(v1.V8H(), 1);
+		armAsm->Add(v3.V8H(), v2.V8H(), v1.V8H());
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		WrapLOD(v2, v3, v0, v1, v5, v6);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		WrapLOD(v2, v0, v1, v5, v6);
+	}
+
+	SampleTexture_TexelReadHelper(0);
+
+	if (m_sel.mmin != 1) // !round-off mode
+	{
+		armAsm->Sshr(v2.V4S(), uv0.V4S(), 1);
+		armAsm->Sshr(v3.V4S(), uv1.V4S(), 1);
+
+		armAsm->Mov(local0, v5);
+		armAsm->Mov(local1, v6);
+
+		armAsm->Ldr(v5, _local(temp.uv_minmax[0]));
+		armAsm->Ldr(v6, _local(temp.uv_minmax[1]));
+
+		armAsm->Ushr(v5.V8H(), v5.V8H(), 1);
+		armAsm->Ushr(v6.V8H(), v6.V8H(), 1);
+
+		if (m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			armAsm->Movi(v4.V4S(), 0x8000);
+			armAsm->Sub(v2.V4S(), v2.V4S(), v4.V4S());
+			armAsm->Sub(v3.V4S(), v3.V4S(), v4.V4S());
+
+			// GSVector4i uf = u.xxzzlh().srl16(12);
+
+			armAsm->Trn1(uf.V8H(), v2.V8H(), v2.V8H());
+			armAsm->Ushr(uf.V8H(), uf.V8H(), 12);
+
+			// GSVector4i vf = v.xxzzlh().srl16(12);
+
+			armAsm->Trn1(vf.V8H(), v3.V8H(), v3.V8H());
+			armAsm->Ushr(vf.V8H(), vf.V8H(), 12);
+		}
+
+		// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+		armAsm->Sshr(v2.V4S(), v2.V4S(), 16);
+		armAsm->Sshr(v3.V4S(), v3.V4S(), 16);
+		armAsm->Sqxtn(v2.V4H(), v2.V4S());
+		armAsm->Sqxtn2(v2.V8H(), v3.V4S());
+
+		if (m_sel.ltf)
+		{
+			// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+			armAsm->Movi(v1.V8H(), 1);
+			armAsm->Add(v3.V8H(), v2.V8H(), v1.V8H());
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			WrapLOD(v2, v3, v0, v1, v5, v6);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			WrapLOD(v2, v0, v1, v5, v6);
+		}
+
+		armAsm->Ldr(local2, m_sel.lcm ? _global(lod.f) : _local(temp.lod.f));
+
+		SampleTexture_TexelReadHelper(1);
+
+		// v5: rb
+		// v6: ga
+
+		armAsm->Ushr(v0.V8H(), local2.V8H(), 1);
+
+		lerp16(v5, local0, v0, 0);
+		lerp16(v6, local1, v0, 0);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const VRegister& uv,
+	const VRegister& tmp, const VRegister& tmp2,
+	const VRegister& min, const VRegister& max)
+{
+	const int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	const int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+	const int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if (wms_clamp == wmt_clamp)
+	{
+		if (wms_clamp)
+		{
+			if (region)
+			{
+				armAsm->Smax(uv.V8H(), uv.V8H(), min.V8H());
+			}
+			else
+			{
+				armAsm->Movi(tmp.V8H(), 0);
+				armAsm->Smax(uv.V8H(), uv.V8H(), tmp.V8H());
+			}
+
+			armAsm->Smin(uv.V8H(), uv.V8H(), max.V8H());
+		}
+		else
+		{
+			armAsm->And(uv.V16B(), uv.V16B(), min.V16B());
+
+			if (region)
+				armAsm->Orr(uv.V16B(), uv.V16B(), max.V16B());
+		}
+	}
+	else
+	{
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+		armAsm->And(tmp.V16B(), uv.V16B(), min.V16B());
+		if (region)
+			armAsm->Orr(tmp.V16B(), tmp.V16B(), max.V16B());
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+		armAsm->Smax(uv.V8H(), uv.V8H(), min.V8H());
+		armAsm->Smin(tmp2.V8H(), uv.V8H(), max.V8H());
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+		armAsm->Sshr(uv.V16B(), _global_tmask.V16B(), 7);
+		armAsm->Bsl(uv.V16B(), tmp.V16B(), tmp2.V16B());
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(
+	const VRegister& uv0, const VRegister& uv1,
+	const VRegister& tmp, const VRegister& tmp2,
+	const VRegister& min, const VRegister& max)
+{
+	const int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	const int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+	const int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if (wms_clamp == wmt_clamp)
+	{
+		if (wms_clamp)
+		{
+			if (region)
+			{
+				armAsm->Smax(uv0.V8H(), uv0.V8H(), min.V8H());
+				armAsm->Smax(uv1.V8H(), uv1.V8H(), min.V8H());
+			}
+			else
+			{
+				armAsm->Movi(tmp.V8H(), 0);
+				armAsm->Smax(uv0.V8H(), uv0.V8H(), tmp.V8H());
+				armAsm->Smax(uv1.V8H(), uv1.V8H(), tmp.V8H());
+			}
+
+			armAsm->Smin(uv0.V8H(), uv0.V8H(), max.V8H());
+			armAsm->Smin(uv1.V8H(), uv1.V8H(), max.V8H());
+		}
+		else
+		{
+			armAsm->And(uv0.V16B(), uv0.V16B(), min.V16B());
+			armAsm->And(uv1.V16B(), uv1.V16B(), min.V16B());
+
+			if (region)
+			{
+				armAsm->Orr(uv0.V16B(), uv0.V16B(), max.V16B());
+				armAsm->Orr(uv1.V16B(), uv1.V16B(), max.V16B());
+			}
+		}
+	}
+	else
+	{
+		for (const VRegister& uv : {uv0, uv1})
+		{
+			// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+			armAsm->And(tmp.V16B(), uv.V16B(), min.V16B());
+			if (region)
+				armAsm->Orr(tmp.V16B(), tmp.V16B(), max.V16B());
+
+			// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+			armAsm->Smax(uv.V8H(), uv.V8H(), min.V8H());
+			armAsm->Smin(tmp2.V8H(), uv.V8H(), max.V8H());
+
+			// clamp.blend8(repeat, m_local.gd->t.mask);
+
+			armAsm->Sshr(uv.V16B(), _global_tmask.V16B(), 7);
+			armAsm->Bsl(uv.V16B(), tmp.V16B(), tmp2.V16B());
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{
+	if (!m_sel.fb)
+	{
+		return;
+	}
+
+	switch (m_sel.tfx)
+	{
+		case TFX_MODULATE:
+
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			// gat = gat.modulate16<1>(ga).clamp8();
+			// modulate16(v6, v4, 1);
+			modulate16(v6, _temp_ga, 1);
+			clamp16(v6, v3);
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if (!m_sel.tcc)
+			{
+				armAsm->Ushr(v4.V8H(), _temp_ga.V8H(), 7);
+
+				mix16(v6, v4, v3);
+			}
+
+			break;
+
+		case TFX_DECAL:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if (!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+				armAsm->Ushr(v4.V8H(), _temp_ga.V8H(), 7);
+				mix16(v6, v4, v3);
+			}
+
+			break;
+
+		case TFX_HIGHLIGHT:
+
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+			armAsm->Ushr(v4.V8H(), _temp_ga.V8H(), 7);
+
+			if (m_sel.tcc)
+				armAsm->Uqadd(v4.V16B(), v4.V16B(), v6.V16B());
+
+			mix16(v6, v4, v3);
+
+			break;
+
+		case TFX_HIGHLIGHT2:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if (!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_local.c.ga;
+				armAsm->Ushr(v4.V8H(), _temp_ga.V8H(), 7);
+
+				mix16(v6, v4, v3);
+			}
+
+			break;
+
+		case TFX_NONE:
+
+			// gat = iip ? ga.srl16(7) : ga;
+
+			if (m_sel.iip)
+				armAsm->Ushr(v6.V8H(), _temp_ga.V8H(), 7);
+			else
+				armAsm->Mov(v6, _temp_ga);
+
+			break;
+	}
+
+	if (m_sel.aa1)
+	{
+		// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+		// FIXME: bios config screen cubes
+
+		if (!m_sel.abe)
+		{
+			// a = cov
+
+			if (m_sel.edge)
+				armAsm->Ldr(v0, _local(temp.cov));
+			else
+				armAsm->Movi(v0.V8H(), 0x0080);
+
+			mix16(v6, v0, v1);
+		}
+		else
+		{
+			// a = a == 0x80 ? cov : a
+
+			armAsm->Movi(v0.V8H(), 0x0080);
+
+			if (m_sel.edge)
+				armAsm->Ldr(v1, _local(temp.cov));
+			else
+				armAsm->Mov(v1, v0);
+
+			armAsm->Cmeq(v0.V8H(), v0.V8H(), v6.V8H());
+			armAsm->Ushr(v0.V4S(), v0.V4S(), 16);
+			armAsm->Shl(v0.V4S(), v0.V4S(), 16);
+
+			blend8(v6, v1, v0, _vscratch);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{
+	if (m_sel.fwrite)
+		armAsm->Ldr(v3, _global(fm));
+
+	if (m_sel.zwrite)
+		armAsm->Ldr(v4, _global(zm));
+}
+
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{
+	switch (m_sel.atst)
+	{
+		case ATST_NEVER:
+			// t = GSVector4i::xffffffff();
+			// pcmpeqd(v1, v1);
+			armAsm->Movi(v1.V2D(), 0xFFFFFFFFFFFFFFFFULL);
+			break;
+
+		case ATST_ALWAYS:
+			return;
+
+		case ATST_LESS:
+		case ATST_LEQUAL:
+			// t = (ga >> 16) > m_local.gd->aref;
+			armAsm->Dup(_vscratch.V4S(), _local_aref);
+			armAsm->Ushr(v1.V4S(), v6.V4S(), 16);
+			armAsm->Cmgt(v1.V4S(), v1.V4S(), _vscratch.V4S());
+			break;
+
+		case ATST_EQUAL:
+			// t = (ga >> 16) != m_local.gd->aref;
+			armAsm->Dup(_vscratch.V4S(), _local_aref);
+			armAsm->Ushr(v1.V4S(), v6.V4S(), 16);
+			armAsm->Cmeq(v1.V4S(), v1.V4S(), _vscratch.V4S());
+			armAsm->Mvn(v1.V16B(), v1.V16B());
+			break;
+
+		case ATST_GEQUAL:
+		case ATST_GREATER:
+			// t = (ga >> 16) < m_local.gd->aref;
+			armAsm->Dup(_vscratch.V4S(), _local_aref);
+			armAsm->Ushr(v1.V4S(), v6.V4S(), 16);
+			armAsm->Cmgt(v1.V4S(), _vscratch.V4S(), v1.V4S());
+			break;
+
+		case ATST_NOTEQUAL:
+			// t = (ga >> 16) == m_local.gd->aref;
+			armAsm->Dup(_vscratch.V4S(), _local_aref);
+			armAsm->Ushr(v1.V4S(), v6.V4S(), 16);
+			armAsm->Cmeq(v1.V4S(), v1.V4S(), _vscratch.V4S());
+			break;
+	}
+
+	switch (m_sel.afail)
+	{
+		case AFAIL_KEEP:
+			// test |= t;
+			armAsm->Orr(_test.V16B(), _test.V16B(), v1.V16B());
+			alltrue(_test, _vscratch);
+			break;
+
+		case AFAIL_FB_ONLY:
+			// zm |= t;
+			armAsm->Orr(v4.V16B(), v4.V16B(), v1.V16B());
+			break;
+
+		case AFAIL_ZB_ONLY:
+			// fm |= t;
+			armAsm->Orr(v3.V16B(), v3.V16B(), v1.V16B());
+			break;
+
+		case AFAIL_RGB_ONLY:
+			// zm |= t;
+			armAsm->Orr(v4.V16B(), v4.V16B(), v1.V16B());
+
+			// fm |= t & GSVector4i::xff000000();
+			armAsm->Ushr(v1.V4S(), v1.V4S(), 24);
+			armAsm->Shl(v1.V4S(), v1.V4S(), 24);
+			armAsm->Orr(v3.V16B(), v3.V16B(), v1.V16B());
+			break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{
+	if (!m_sel.fwrite)
+	{
+		return;
+	}
+
+	switch (m_sel.tfx)
+	{
+		case TFX_MODULATE:
+
+			// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).clamp8();
+
+			modulate16(v5, _temp_rb, 1);
+			clamp16(v5, v1);
+
+			break;
+
+		case TFX_DECAL:
+
+			break;
+
+		case TFX_HIGHLIGHT:
+		case TFX_HIGHLIGHT2:
+
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+			// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+
+			armAsm->Mov(v1, v6);
+			modulate16(v6, _temp_ga, 1);
+
+			armAsm->Trn2(v2.V8H(), _temp_ga.V8H(), _temp_ga.V8H());
+			armAsm->Ushr(v2.V8H(), v2.V8H(), 7);
+			armAsm->Add(v6.V8H(), v6.V8H(), v2.V8H());
+
+			clamp16(v6, v0);
+
+			mix16(v6, v1, v0);
+
+			// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+			modulate16(v5, _temp_rb, 1);
+			armAsm->Add(v5.V8H(), v5.V8H(), v2.V8H());
+
+			clamp16(v5, v0);
+
+			break;
+
+		case TFX_NONE:
+
+			// rbt = iip ? rb.srl16(7) : rb;
+
+			if (m_sel.iip)
+				armAsm->Ushr(v5.V8H(), _temp_rb.V8H(), 7);
+			else
+				armAsm->Mov(v5, _temp_rb);
+
+			break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{
+	if (!m_sel.fwrite || !m_sel.fge)
+	{
+		return;
+	}
+
+	// rb = m_local.gd->frb.lerp16<0>(rb, f);
+	// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
+
+	armAsm->Dup(_vscratch.V4S(), _global_frb);
+	armAsm->Dup(_vscratch2.V4S(), _global_fga);
+	armAsm->Mov(v1, v6);
+
+	lerp16(v5, _vscratch, _temp_f, 0);
+
+	lerp16(v6, _vscratch2, _temp_f, 0);
+	mix16(v6, v1, v0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{
+	if (!m_sel.fb)
+	{
+		return;
+	}
+
+	// int fa = fza_base.x + fza_offset->x;
+
+	armAsm->Ldr(w6, MemOperand(x7));
+	armAsm->Ldr(_wscratch, MemOperand(x8));
+	armAsm->Add(w6, w6, _wscratch);
+	armAsm->And(w6, w6, HALF_VM_SIZE - 1);
+
+	if (!m_sel.rfb)
+	{
+		return;
+	}
+
+	ReadPixel(v2, w6);
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{
+	if (!m_sel.date || (m_sel.fpsm != 0 && m_sel.fpsm != 2))
+	{
+		return;
+	}
+
+	// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
+
+	if (m_sel.datm)
+	{
+		if (m_sel.fpsm == 2)
+		{
+			armAsm->Movi(v0.V4S(), 0);
+			armAsm->Shl(v1.V4S(), _fd.V4S(), 16);
+			armAsm->Ushr(v1.V4S(), v1.V4S(), 31);
+			armAsm->Cmeq(v1.V4S(), v1.V4S(), v0.V4S());
+		}
+		else
+		{
+			armAsm->Mvn(v1.V16B(), _fd.V16B());
+			armAsm->Sshr(v1.V4S(), v1.V4S(), 31);
+		}
+	}
+	else
+	{
+		if (m_sel.fpsm == 2)
+		{
+			armAsm->Shl(v1.V4S(), _fd.V4S(), 16);
+			armAsm->Sshr(v1.V4S(), v1.V4S(), 31);
+		}
+		else
+		{
+			armAsm->Sshr(v1.V4S(), _fd.V4S(), 31);
+		}
+	}
+
+	armAsm->Orr(_test.V16B(), _test.V16B(), v1.V16B());
+
+	alltrue(_test, _vscratch);
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{
+	if (m_sel.notest)
+	{
+		return;
+	}
+
+	// fm |= test;
+	// zm |= test;
+
+	if (m_sel.fwrite)
+		armAsm->Orr(v3.V16B(), v3.V16B(), _test.V16B());
+
+	if (m_sel.zwrite)
+		armAsm->Orr(v4.V16B(), v4.V16B(), _test.V16B());
+
+	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+
+	armAsm->Movi(v1.V4S(), 0xFFFFFFFFu);
+
+	if (m_sel.fwrite && m_sel.zwrite)
+	{
+		armAsm->Cmeq(v0.V4S(), v1.V4S(), v4.V4S());
+		armAsm->Cmeq(v1.V4S(), v1.V4S(), v3.V4S());
+		armAsm->Sqxtn(v1.V4H(), v1.V4S());
+		armAsm->Sqxtn2(v1.V8H(), v0.V4S());
+	}
+	else if (m_sel.fwrite)
+	{
+		armAsm->Cmeq(_vscratch.V4S(), v1.V4S(), v3.V4S());
+		armAsm->Sqxtn(v1.V4H(), _vscratch.V4S());
+		armAsm->Sqxtn2(v1.V8H(), _vscratch.V4S());
+	}
+	else if (m_sel.zwrite)
+	{
+		armAsm->Cmeq(_vscratch.V4S(), v1.V4S(), v4.V4S());
+		armAsm->Sqxtn(v1.V4H(), _vscratch.V4S());
+		armAsm->Sqxtn2(v1.V8H(), _vscratch.V4S());
+	}
+
+	// horizontal add of the masked per-lane weights produces the fzm bitmask
+	armAsm->And(v1.V16B(), v1.V16B(), _const_movemskw_mask.V16B());
+	armAsm->Addv(v1.H(), v1.V8H());
+	armAsm->Umov(w1, v1.V8H(), 0);
+	armAsm->Mvn(w1, w1);
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{
+	if (!m_sel.zwrite)
+	{
+		return;
+	}
+
+	armAsm->Mov(v1, m_sel.prim != GS_SPRITE_CLASS ? _temp_zs : _temp_z0);
+
+	if (m_sel.ztest && m_sel.zpsm < 2)
+	{
+		// zs = zs.blend8(zd, zm);
+
+		blend8(v1, _temp_zd, v4, _vscratch);
+	}
+
+	// Clamp Z to ZPSM_FMT_MAX
+	if (m_sel.zclamp)
+	{
+		armAsm->Movi(v7.V4S(), 0xFFFFFFFFu >> (u8)((m_sel.zpsm & 0x3) * 8));
+		armAsm->Smin(v1.V4S(), v1.V4S(), v7.V4S());
+	}
+
+	bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
+
+	WritePixel(v1, w9, w1, true, fast, m_sel.zpsm, 1);
+}
+
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{
+	if (!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if (m_sel.abe == 0 && m_sel.aa1 == 0)
+	{
+		return;
+	}
+
+	if (((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1)) || m_sel.abd == 1)
+	{
+		switch (m_sel.fpsm)
+		{
+			case 0:
+			case 1:
+
+				// c[2] = fd & mask;
+				// c[3] = (fd >> 8) & mask;
+
+				split16_2x8(v0, v1, v2);
+
+				break;
+
+			case 2:
+
+				// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+				// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+				armAsm->Movi(v7.V4S(), 0x1F);
+				armAsm->And(v0.V16B(), v2.V16B(), v7.V16B());
+				armAsm->Shl(v0.V4S(), v0.V4S(), 3);
+
+				armAsm->Movi(v7.V4S(), 0x7C00);
+				armAsm->And(v4.V16B(), v2.V16B(), v7.V16B());
+				armAsm->Movi(v7.V4S(), 0x3E0);
+				armAsm->Shl(v4.V4S(), v4.V4S(), 9);
+
+				armAsm->Orr(v0.V16B(), v0.V16B(), v4.V16B());
+
+				armAsm->And(v1.V16B(), v2.V16B(), v7.V16B());
+				armAsm->Ushr(v1.V4S(), v1.V4S(), 2);
+
+				armAsm->Movi(v7.V4S(), 0x8000);
+				armAsm->And(v4.V16B(), v2.V16B(), v7.V16B());
+				armAsm->Shl(v4.V4S(), v4.V4S(), 8);
+
+				armAsm->Orr(v1.V16B(), v1.V16B(), v4.V16B());
+				break;
+		}
+	}
+
+	if (m_sel.pabe || ((m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)))
+	{
+		// movdqa(v4, v5);
+		armAsm->Mov(v4, v5);
+	}
+
+	if (m_sel.aba != m_sel.abb)
+	{
+		// rb = c[aba * 2 + 0];
+
+		switch (m_sel.aba)
+		{
+			case 0:
+				break;
+			case 1:
+				armAsm->Mov(v5, v0);
+				break;
+			case 2:
+				armAsm->Movi(v5.V16B(), 0);
+				break;
+		}
+
+		// rb = rb.sub16(c[abb * 2 + 0]);
+
+		switch (m_sel.abb)
+		{
+			case 0:
+				armAsm->Sub(v5.V8H(), v5.V8H(), v4.V8H());
+				break;
+			case 1:
+				armAsm->Sub(v5.V8H(), v5.V8H(), v0.V8H());
+				break;
+			case 2:
+				break;
+		}
+
+		if (!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
+
+			switch (m_sel.abc)
+			{
+				case 0:
+				case 1:
+					armAsm->Trn2(v7.V8H(), m_sel.abc ? v1.V8H() : v6.V8H(), m_sel.abc ? v1.V8H() : v6.V8H());
+					armAsm->Shl(v7.V8H(), v7.V8H(), 7);
+					break;
+				case 2:
+					armAsm->Ldr(v7, _global(afix));
+					break;
+			}
+
+			// rb = rb.modulate16<1>(a);
+
+			modulate16(v5, v7, 1);
+		}
+
+		// rb = rb.add16(c[abd * 2 + 0]);
+
+		switch (m_sel.abd)
+		{
+			case 0:
+				armAsm->Add(v5.V8H(), v5.V8H(), v4.V8H());
+				break;
+			case 1:
+				armAsm->Add(v5.V8H(), v5.V8H(), v0.V8H());
+				break;
+			case 2:
+				break;
+		}
+	}
+	else
+	{
+		// rb = c[abd * 2 + 0];
+
+		switch (m_sel.abd)
+		{
+			case 0:
+				break;
+			case 1:
+				armAsm->Mov(v5, v0);
+				break;
+			case 2:
+				armAsm->Movi(v5.V16B(), 0);
+				break;
+		}
+	}
+
+	if (m_sel.pabe)
+	{
+		// mask = (c[1] << 8).sra32(31);
+
+		armAsm->Shl(v0.V4S(), v6.V4S(), 8);
+		armAsm->Sshr(v0.V4S(), v0.V4S(), 31);
+
+		// rb = c[0].blend8(rb, mask);
+
+		blend8r(v5, v4, v0, _vscratch);
+	}
+
+	armAsm->Mov(v4, v6);
+
+	if (m_sel.aba != m_sel.abb)
+	{
+		// ga = c[aba * 2 + 1];
+
+		switch (m_sel.aba)
+		{
+			case 0:
+				break;
+			case 1:
+				armAsm->Mov(v6, v1);
+				break;
+			case 2:
+				armAsm->Movi(v6.V16B(), 0);
+				break;
+		}
+
+		// ga = ga.sub16(c[abb * 2 + 1]);
+
+		switch (m_sel.abb)
+		{
+			case 0:
+				armAsm->Sub(v6.V8H(), v6.V8H(), v4.V8H());
+				break;
+			case 1:
+				armAsm->Sub(v6.V8H(), v6.V8H(), v1.V8H());
+				break;
+			case 2:
+				break;
+		}
+
+		if (!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// ga = ga.modulate16<1>(a);
+
+			modulate16(v6, v7, 1);
+		}
+
+		// ga = ga.add16(c[abd * 2 + 1]);
+
+		switch (m_sel.abd)
+		{
+			case 0:
+				armAsm->Add(v6.V8H(), v6.V8H(), v4.V8H());
+				break;
+			case 1:
+				armAsm->Add(v6.V8H(), v6.V8H(), v1.V8H());
+				break;
+			case 2:
+				break;
+		}
+	}
+	else
+	{
+		// ga = c[abd * 2 + 1];
+
+		switch (m_sel.abd)
+		{
+			case 0:
+				break;
+			case 1:
+				armAsm->Mov(v6, v1);
+				break;
+			case 2:
+				armAsm->Movi(v6.V16B(), 0);
+				break;
+		}
+	}
+
+	if (m_sel.pabe)
+	{
+		armAsm->Ushr(v0.V4S(), v0.V4S(), 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+		blend8r(v6, v4, v0, _vscratch);
+	}
+	else
+	{
+		if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		{
+			mix16(v6, v4, v7);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{
+	if (!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if (m_sel.fpsm == 2 && m_sel.dthe)
+	{
+		armAsm->And(w5, _top, 3);
+		armAsm->Lsl(w5, w5, 5);
+		armAsm->Ldr(_vscratch, MemOperand(_global_dimx, x5));
+		armAsm->Add(x5, x5, sizeof(GSVector4i));
+		armAsm->Ldr(_vscratch2, MemOperand(_global_dimx, x5));
+		armAsm->Add(v5.V8H(), v5.V8H(), _vscratch.V8H());
+		armAsm->Add(v6.V8H(), v6.V8H(), _vscratch2.V8H());
+	}
+
+	if (m_sel.colclamp == 0)
+	{
+		// c[0] &= 0x000000ff;
+		// c[1] &= 0x000000ff;
+
+		armAsm->Movi(v7.V8H(), 0xFF);
+
+		armAsm->And(v5.V16B(), v5.V16B(), v7.V16B());
+		armAsm->And(v6.V16B(), v6.V16B(), v7.V16B());
+	}
+
+	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+
+	armAsm->Zip2(v7.V8H(), v5.V8H(), v6.V8H());
+	armAsm->Zip1(v5.V8H(), v5.V8H(), v6.V8H());
+	armAsm->Sqxtun(v5.V8B(), v5.V8H());
+	armAsm->Sqxtun2(v5.V16B(), v7.V8H());
+
+	if (m_sel.fba && m_sel.fpsm != 1)
+	{
+		// fs |= 0x80000000;
+
+		armAsm->Movi(v7.V4S(), 0x80000000);
+		armAsm->Orr(v5.V16B(), v5.V16B(), v7.V16B());
+	}
+
+	if (m_sel.fpsm == 2)
+	{
+		// GSVector4i rb = fs & 0x00f800f8;
+		// GSVector4i ga = fs & 0x8000f800;
+
+		armAsm->Movi(v6.V4S(), 0x00f800f8);
+
+		armAsm->Movi(v7.V4S(), 0x8000f800);
+
+		armAsm->And(v4.V16B(), v5.V16B(), v6.V16B());
+		armAsm->And(v5.V16B(), v5.V16B(), v7.V16B());
+
+		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		armAsm->Ushr(v6.V4S(), v4.V4S(), 9);
+		armAsm->Ushr(v7.V4S(), v5.V4S(), 16);
+		armAsm->Ushr(v4.V4S(), v4.V4S(), 3);
+		armAsm->Ushr(v5.V4S(), v5.V4S(), 6);
+
+		armAsm->Orr(v5.V16B(), v5.V16B(), v4.V16B());
+		armAsm->Orr(v7.V16B(), v7.V16B(), v6.V16B());
+		armAsm->Orr(v5.V16B(), v5.V16B(), v7.V16B());
+	}
+
+	const VRegister& pixel = m_sel.rfb ? v3 : v5;
+	if (m_sel.rfb)
+	{
+		// fs = fs.blend(fd, fm);
+
+		armAsm->Bsl(v3.V16B(), v2.V16B(), v5.V16B());
+	}
+
+	const bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
+
+	WritePixel(pixel, w6, w1, false, fast, m_sel.fpsm, 0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const VRegister& dst, const Register& addr)
+{
+	pxAssert(addr.IsW());
+	armAsm->Lsl(_wscratch, addr, 1); // *2
+	armAsm->Ldr(dst.D(), MemOperand(_vm, _xscratch));
+	armAsm->Add(_scratchaddr, _vm_high, _xscratch);
+	armAsm->Ld1(dst.V2D(), 1, MemOperand(_scratchaddr));
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const VRegister& src, const Register& addr, const Register& mask, bool high, bool fast, int psm, int fz)
+{
+	pxAssert(addr.IsW() && mask.IsW());
+	if (m_sel.notest)
+	{
+		if (fast)
+		{
+			armAsm->Lsl(_wscratch, addr, 1); // *2
+			armAsm->Str(src.D(), MemOperand(_vm, _xscratch));
+			armAsm->Add(_scratchaddr, _vm_high, _xscratch);
+			armAsm->St1(src.V2D(), 1, MemOperand(_scratchaddr));
+		}
+		else
+		{
+			WritePixel(src, addr, 0, psm);
+			WritePixel(src, addr, 1, psm);
+			WritePixel(src, addr, 2, psm);
+			WritePixel(src, addr, 3, psm);
+		}
+	}
+	else
+	{
+		if (fast)
+		{
+			// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
+			// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
+
+			Label skip_low, skip_high;
+			armAsm->Lsl(_wscratch, addr, 1); // *2
+
+			armAsm->Tst(mask, high ? 0x0F00 : 0x0F);
+			armAsm->B(eq, &skip_low);
+			armAsm->Str(src.D(), MemOperand(_vm, _xscratch));
+			armAsm->Bind(&skip_low);
+
+			armAsm->Tst(mask, high ? 0xF000 : 0xF0);
+			armAsm->B(eq, &skip_high);
+			armAsm->Add(_scratchaddr, _vm_high, _xscratch);
+			armAsm->St1(src.V2D(), 1, MemOperand(_scratchaddr));
+			armAsm->Bind(&skip_high);
+		}
+		else
+		{
+			// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
+			// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
+			// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
+			// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
+
+			Label skip_0, skip_1, skip_2, skip_3;
+
+			armAsm->Tst(mask, high ? 0x0300 : 0x03);
+			armAsm->B(eq, &skip_0);
+			WritePixel(src, addr, 0, psm);
+			armAsm->Bind(&skip_0);
+
+			armAsm->Tst(mask, high ? 0x0c00 : 0x0c);
+			armAsm->B(eq, &skip_1);
+			WritePixel(src, addr, 1, psm);
+			armAsm->Bind(&skip_1);
+
+			armAsm->Tst(mask, high ? 0x3000 : 0x30);
+			armAsm->B(eq, &skip_2);
+			WritePixel(src, addr, 2, psm);
+			armAsm->Bind(&skip_2);
+
+			armAsm->Tst(mask, high ? 0xc000 : 0xc0);
+			armAsm->B(eq, &skip_3);
+			WritePixel(src, addr, 3, psm);
+			armAsm->Bind(&skip_3);
+		}
+	}
+}
+
+static const int s_offsets[4] = {0, 2, 8, 10};
+
+void GSDrawScanlineCodeGenerator::WritePixel(const VRegister& src, const Register& addr, u8 i, int psm)
+{
+	pxAssert(addr.IsW());
+	// Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
+	armAsm->Lsl(_wscratch, addr, 1); // *2
+	armAsm->Add(_scratchaddr, _vm, s_offsets[i] * 2);
+
+	switch (psm)
+	{
+		case 0:
+			if (i == 0)
+			{
+				armAsm->Str(src.S(), MemOperand(_scratchaddr, _xscratch));
+			}
+			else
+			{
+				armAsm->Add(_scratchaddr, _scratchaddr, _xscratch);
+				armAsm->St1(src.V4S(), i, MemOperand(_scratchaddr));
+			}
+			break;
+		case 1:
+			armAsm->Ldr(_wscratch2, MemOperand(_scratchaddr, _xscratch));
+
+			armAsm->Mov(w5, src.V4S(), i);
+
+			armAsm->Eor(w5, w5, _wscratch2);
+			armAsm->And(w5, w5, 0xffffff);
+			armAsm->Eor(_wscratch2, _wscratch2, w5);
+			armAsm->Str(_wscratch2, MemOperand(_scratchaddr, _xscratch));
+
+			break;
+		case 2:
+			armAsm->Umov(w5, src.V8H(), i * 2);
+			armAsm->Strh(w5, MemOperand(_scratchaddr, _xscratch));
+			break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel1(const VRegister& dst, const VRegister& src, const VRegister& tmp1, int mip_offset)
+{
+	const VRegister no; // Hopefully this will assert if we accidentally use it
+	ReadTexelImpl(dst, tmp1, src, no, no, no, 1, mip_offset);
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel4(
+	const VRegister& d0, const VRegister& d1,
+	const VRegister& d2s0, const VRegister& d3s1,
+	const VRegister& s2, const VRegister& s3,
+	int mip_offset)
+{
+	ReadTexelImpl(d0, d1, d2s0, d3s1, s2, s3, 4, mip_offset);
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexelImplLoadTexLOD(const Register& addr, int lod, int mip_offset)
+{
+	pxAssert(addr.IsX());
+	pxAssert(m_sel.mmin);
+	armAsm->Ldr(addr.W(), m_sel.lcm ? _global(lod.i.U32[lod]) : _local(temp.lod.i.U32[lod]));
+	if (mip_offset != 0)
+		armAsm->Add(addr.W(), addr.W(), mip_offset);
+	armAsm->Ldr(addr.X(), MemOperand(_global_tex0, addr, LSL, 3));
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexelImpl(
+	const VRegister& d0, const VRegister& d1,
+	const VRegister& d2s0, const VRegister& d3s1,
+	const VRegister& s2, const VRegister& s3,
+	int pixels, int mip_offset)
+{
+	//mip_offset *= wordsize;
+
+	const bool preserve[] = {false, false, true, true};
+	const VRegister dst[] = {d0, d1, d2s0, d3s1};
+	const VRegister src[] = {d2s0, d3s1, s2, s3};
+
+	if (m_sel.mmin && !m_sel.lcm)
+	{
+		for (int j = 0; j < 4; j++)
+		{
+			ReadTexelImplLoadTexLOD(_xscratch, j, mip_offset);
+
+			for (int i = 0; i < pixels; i++)
+			{
+				ReadTexelImpl(dst[i], src[i], j, _xscratch, preserve[i]);
+			}
+		}
+	}
+	else
+	{
+		Register base_register(_global_tex0);
+
+		if (m_sel.mmin && m_sel.lcm)
+		{
+			ReadTexelImplLoadTexLOD(_xscratch, 0, mip_offset);
+			base_register = _xscratch;
+		}
+
+		for (int i = 0; i < pixels; i++)
+		{
+			for (int j = 0; j < 4; j++)
+			{
+				ReadTexelImpl(dst[i], src[i], j, base_register, false);
+			}
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexelImpl(const VRegister& dst,
+	const VRegister& addr, u8 i, const Register& baseRegister, bool preserveDst)
+{
+	// const Address& src = m_sel.tlu ? ptr[w1 + w5 * 4] : ptr[w6 + w5 * 4];
+	pxAssert(baseRegister.GetCode() != _scratchaddr.GetCode());
+	pxAssert(baseRegister.IsX());
+	armAsm->Mov(_scratchaddr.W(), addr.V4S(), i);
+
+	if (m_sel.tlu)
+	{
+		armAsm->Ldrb(_scratchaddr.W(), MemOperand(baseRegister, _scratchaddr));
+
+		armAsm->Add(_scratchaddr, _global_clut, Operand(_scratchaddr, UXTW, 2));
+		if (i == 0 && !preserveDst)
+			armAsm->Ldr(dst.S(), MemOperand(_scratchaddr));
+		else
+			armAsm->Ld1(dst.V4S(), i, MemOperand(_scratchaddr));
+	}
+	else
+	{
+		armAsm->Add(_scratchaddr, baseRegister, Operand(_scratchaddr, UXTW, 2));
+		if (i == 0 && !preserveDst)
+			armAsm->Ldr(dst.S(), MemOperand(_scratchaddr));
+		else
+			armAsm->Ld1(dst.V4S(), i, MemOperand(_scratchaddr));
+	}
+}
+
+void GSDrawScanlineCodeGenerator::modulate16(const VRegister& a, const VRegister& f, u8 shift)
+{
+	modulate16(a, a, f, shift);
+}
+
+void GSDrawScanlineCodeGenerator::modulate16(const VRegister& d, const VRegister& a, const VRegister& f, u8 shift)
+{
+	// potentially going to cause issues due to saturation
+	armAsm->Shl(d.V8H(), a.V8H(), shift + 1);
+	if (shift != 0)
+		armAsm->Sqdmulh(d.V8H(), d.V8H(), f.V8H());
+	else
+		armAsm->Sqrdmulh(d.V8H(), d.V8H(), f.V8H());
+
+	armAsm->Sshr(d.V8H(), d.V8H(), 1);
+}
+
+void GSDrawScanlineCodeGenerator::lerp16(const VRegister& a, const VRegister& b, const VRegister& f, u8 shift)
+{
+	armAsm->Sub(a.V8H(), a.V8H(), b.V8H());
+	modulate16(a, f, shift);
+	armAsm->Add(a.V8H(), a.V8H(), b.V8H());
+}
+
+void GSDrawScanlineCodeGenerator::lerp16_4(const VRegister& a, const VRegister& b, const VRegister& f)
+{
+	armAsm->Sub(a.V8H(), a.V8H(), b.V8H());
+	armAsm->Mul(a.V8H(), a.V8H(), f.V8H());
+	armAsm->Sshr(a.V8H(), a.V8H(), 4);
+	armAsm->Add(a.V8H(), a.V8H(), b.V8H());
+}
+
+void GSDrawScanlineCodeGenerator::mix16(const VRegister& a, const VRegister& b, const VRegister& temp)
+{
+	pxAssert(a.GetCode() != temp.GetCode() && b.GetCode() != temp.GetCode());
+
+	armAsm->Mov(temp, a);
+	armAsm->Movi(a.V4S(), 0xFFFF0000);
+	armAsm->Bsl(a.V16B(), b.V16B(), temp.V16B());
+}
+
+void GSDrawScanlineCodeGenerator::clamp16(const VRegister& a, const VRegister& temp)
+{
+	armAsm->Sqxtun(a.V8B(), a.V8H());
+	armAsm->Ushll(a.V8H(), a.V8B(), 0);
+}
+
+void GSDrawScanlineCodeGenerator::alltrue(const VRegister& test, const VRegister& temp)
+{
+	armAsm->Uminv(temp.S(), test.V4S());
+	armAsm->Fmov(_wscratch, temp.S());
+	armAsm->Cmn(_wscratch, 1);
+	armAsm->B(eq, &m_step_label);
+}
+
+void GSDrawScanlineCodeGenerator::blend8(const VRegister& a, const VRegister& b, const VRegister& mask, const VRegister& temp)
+{
+	armAsm->Sshr(temp.V16B(), mask.V16B(), 7);
+	armAsm->Bsl(temp.V16B(), b.V16B(), a.V16B());
+	armAsm->Mov(a, temp);
+}
+
+void GSDrawScanlineCodeGenerator::blend8r(const VRegister& b, const VRegister& a, const VRegister& mask, const VRegister& temp)
+{
+	armAsm->Sshr(temp.V16B(), mask.V16B(), 7);
+	armAsm->Bsl(temp.V16B(), b.V16B(), a.V16B());
+	armAsm->Mov(b, temp);
+}
+
+void GSDrawScanlineCodeGenerator::split16_2x8(const VRegister& l, const VRegister& h, const VRegister& src)
+{
+	// l = src & 0xFF; (1 left shift + 1 right shift)
+	// h = (src >> 8) & 0xFF; (1 right shift)
+
+	if (src.GetCode() == h.GetCode())
+	{
+		armAsm->Mov(l, src);
+		armAsm->Ushr(h.V8H(), src.V8H(), 8);
+		armAsm->Bic(l.V8H(), 0xFF, 8);
+	}
+	else if (src.GetCode() == l.GetCode())
+	{
+		armAsm->Ushr(h.V8H(), src.V8H(), 8);
+		armAsm->Bic(l.V8H(), 0xFF, 8);
+	}
+	else
+	{
+		armAsm->Mov(l, src);
+		armAsm->Ushr(h.V8H(), src.V8H(), 8);
+		armAsm->Bic(l.V8H(), 0xFF, 8);
+	}
+}
+
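+// Note: the helpers above mirror the SSE helpers in the x86 generator
+// (GSDrawScanlineCodeGenerator.all.cpp): blend8()/blend8r() stand in for
+// pblendvb by widening the mask's sign bit with Sshr and selecting with Bsl,
+// and modulate16() approximates the pmulhrsw-based path with Sqdmulh/Sqrdmulh,
+// which is where the saturation caveat noted above comes from.
+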
+#ifdef __clang__ +#pragma clang diagnostic pop +#endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h new file mode 100644 index 0000000000..7d0a4d9fc3 --- /dev/null +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.arm64.h @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#pragma once + +#include "GS/Renderers/Common/GSFunctionMap.h" +#include "GS/Renderers/SW/GSScanlineEnvironment.h" + +#include "vixl/aarch64/macro-assembler-aarch64.h" + +class GSDrawScanlineCodeGenerator +{ +public: + GSDrawScanlineCodeGenerator(u64 key, void* code, size_t maxsize); + void Generate(); + + size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); } + const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress(); } + +private: + void Init(); + void Step(); + void TestZ(const vixl::aarch64::VRegister& temp1, const vixl::aarch64::VRegister& temp2); + void SampleTexture(); + void SampleTexture_TexelReadHelper(int mip_offset); + void Wrap(const vixl::aarch64::VRegister& uv0); + void Wrap(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1); + void SampleTextureLOD(); + void WrapLOD(const vixl::aarch64::VRegister& uv, + const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2, + const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max); + void WrapLOD(const vixl::aarch64::VRegister& uv0, const vixl::aarch64::VRegister& uv1, + const vixl::aarch64::VRegister& tmp, const vixl::aarch64::VRegister& tmp2, + const vixl::aarch64::VRegister& min, const vixl::aarch64::VRegister& max); + void AlphaTFX(); + void ReadMask(); + void TestAlpha(); + void ColorTFX(); + void Fog(); + void ReadFrame(); + void TestDestAlpha(); + void WriteMask(); + void WriteZBuf(); + void AlphaBlend(); + void WriteFrame(); + void ReadPixel(const vixl::aarch64::VRegister& dst, const vixl::aarch64::Register& addr); + void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, const vixl::aarch64::Register& mask, bool high, bool fast, int psm, int fz); + void WritePixel(const vixl::aarch64::VRegister& src, const vixl::aarch64::Register& addr, u8 i, int psm); + + void ReadTexel1(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src, + const vixl::aarch64::VRegister& tmp1, int mip_offset); + void ReadTexel4( + const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1, + const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1, + const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3, + int mip_offset); + void ReadTexelImplLoadTexLOD(const vixl::aarch64::Register& addr, int lod, int mip_offset); + void ReadTexelImpl( + const vixl::aarch64::VRegister& d0, const vixl::aarch64::VRegister& d1, + const vixl::aarch64::VRegister& d2s0, const vixl::aarch64::VRegister& d3s1, + const vixl::aarch64::VRegister& s2, const vixl::aarch64::VRegister& s3, + int pixels, int mip_offset); + void ReadTexelImpl(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& addr, + u8 i, const vixl::aarch64::Register& baseRegister, bool preserveDst); + + void modulate16(const vixl::aarch64::VRegister& d, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift); + void modulate16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& f, u8 shift); + void lerp16(const vixl::aarch64::VRegister& a, const 
vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f, u8 shift); + void lerp16_4(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& f); + void mix16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& temp); + void clamp16(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& temp); + void alltrue(const vixl::aarch64::VRegister& test, const vixl::aarch64::VRegister& temp); + void blend8(const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp); + void blend8r(const vixl::aarch64::VRegister& b, const vixl::aarch64::VRegister& a, const vixl::aarch64::VRegister& mask, const vixl::aarch64::VRegister& temp); + void split16_2x8(const vixl::aarch64::VRegister& l, const vixl::aarch64::VRegister& h, const vixl::aarch64::VRegister& src); + + vixl::aarch64::MacroAssembler m_emitter; + + GSScanlineSelector m_sel; + + vixl::aarch64::Label m_step_label; +}; diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index 43a2c9ab3c..033952b161 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -156,6 +156,26 @@ struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a p struct { GSVector4i i, f; } lod; // lcm == 1 #endif + +#ifdef _M_ARM64 + // Mini version of constant data for ARM64, we don't need all of it + alignas(16) u32 const_test_128b[8][4] = { + {0x00000000, 0x00000000, 0x00000000, 0x00000000}, + {0xffffffff, 0x00000000, 0x00000000, 0x00000000}, + {0xffffffff, 0xffffffff, 0x00000000, 0x00000000}, + {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000}, + {0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}, + {0x00000000, 0x00000000, 0xffffffff, 0xffffffff}, + {0x00000000, 0x00000000, 0x00000000, 0xffffffff}, + {0x00000000, 0x00000000, 0x00000000, 0x00000000}, + }; + alignas(16) u16 const_movemaskw_mask[8] = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000}; + alignas(16) float const_log2_coef[4] = { + 0.204446009836232697516f, + -1.04913055217340124191f, + 2.28330284476918490682f, + 1.0f}; +#endif }; struct alignas(32) GSScanlineLocalData // per prim variables, each thread has its own diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp new file mode 100644 index 0000000000..20d849ee15 --- /dev/null +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp @@ -0,0 +1,339 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h" +#include "GS/Renderers/SW/GSVertexSW.h" + +#include "common/StringUtil.h" +#include "common/Perf.h" + +#include + +MULTI_ISA_UNSHARED_IMPL; + +using namespace vixl::aarch64; + +static const auto& _vertex = x0; +static const auto& _index = x1; +static const auto& _dscan = x2; +static const auto& _locals = x3; +static const auto& _scratchaddr = x7; +static const auto& _vscratch = v31; + +#define _local(field) MemOperand(_locals, offsetof(GSScanlineLocalData, field)) +#define armAsm (&m_emitter) + +GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize) + : m_emitter(static_cast(code), maxsize, vixl::aarch64::PositionDependentCode) + , m_sel(key) +{ + m_en.z = m_sel.zb ? 1 : 0; + m_en.f = m_sel.fb && m_sel.fge ? 
1 : 0; + m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0; + m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0; +} + +void GSSetupPrimCodeGenerator::Generate() +{ + const bool needs_shift = ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS) || m_en.t || (m_en.c && m_sel.iip); + if (needs_shift) + { + armAsm->Mov(x4, reinterpret_cast(g_const.m_shift_128b)); + for (int i = 0; i < (m_sel.notest ? 2 : 5); i++) + { + armAsm->Ldr(VRegister(3 + i, kFormat16B), MemOperand(x4, i * sizeof(g_const.m_shift_128b[0]))); + } + } + + Depth(); + + Texture(); + + Color(); + + armAsm->Ret(); + + armAsm->FinalizeCode(); + + Perf::any.RegisterKey(GetCode(), GetSize(), "GSSetupPrim_", m_sel.key); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if (!m_en.z && !m_en.f) + { + return; + } + + if (m_sel.prim != GS_SPRITE_CLASS) + { + if (m_en.f) + { + // GSVector4 df = t.wwww(); + armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, t.w)); + armAsm->Ld1r(v1.V4S(), MemOperand(_scratchaddr)); + + // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); + + armAsm->Fmul(v2.V4S(), v1.V4S(), v3.V4S()); + armAsm->Fcvtzs(v2.V4S(), v2.V4S()); + armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H()); + + armAsm->Str(v2.V4S(), _local(d4.f)); + + for (int i = 0; i < (m_sel.notest ? 1 : 4); i++) + { + // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); + + armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S)); + armAsm->Fcvtzs(v2.V4S(), v2.V4S()); + armAsm->Trn1(v2.V8H(), v2.V8H(), v2.V8H()); + + armAsm->Str(v2.V4S(), _local(d[i].f)); + } + } + + if (m_en.z) + { + // VectorF dz = VectorF::broadcast64(&dscan.p.z) + armAsm->Add(_scratchaddr, _dscan, offsetof(GSVertexSW, p.z)); + armAsm->Ld1r(_vscratch.V2D(), MemOperand(_scratchaddr)); + + // m_local.d4.z = dz.mul64(GSVector4::f32to64(shift)); + armAsm->Fcvtl(v1.V2D(), v3.V2S()); + armAsm->Fmul(v1.V2D(), v1.V2D(), _vscratch.V2D()); + armAsm->Str(v1.V2D(), _local(d4.z)); + + armAsm->Fcvtn(v0.V2S(), _vscratch.V2D()); + armAsm->Fcvtn2(v0.V4S(), _vscratch.V2D()); + + for (int i = 0; i < (m_sel.notest ? 
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if (!m_en.t)
+	{
+		return;
+	}
+
+	// GSVector4 t = dscan.t;
+
+	armAsm->Ldr(v0, MemOperand(_dscan, offsetof(GSVertexSW, t)));
+	armAsm->Fmul(v1.V4S(), v0.V4S(), v3.V4S());
+
+	if (m_sel.fst)
+	{
+		// m_local.d4.stq = GSVector4i(t * 4.0f);
+		armAsm->Fcvtzs(v1.V4S(), v1.V4S());
+		armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
+	}
+	else
+	{
+		// m_local.d4.stq = t * 4.0f;
+		armAsm->Str(v1, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.stq)));
+	}
+
+	for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+	{
+		// GSVector4 ds = t.xxxx();
+		// GSVector4 dt = t.yyyy();
+		// GSVector4 dq = t.zzzz();
+
+		armAsm->Dup(v1.V4S(), v0.V4S(), j);
+
+		for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4 v = ds/dt * m_shift[i];
+
+			armAsm->Fmul(v2.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
+
+			if (m_sel.fst)
+			{
+				// m_local.d[i].s/t = GSVector4i(v);
+
+				armAsm->Fcvtzs(v2.V4S(), v2.V4S());
+
+				switch (j)
+				{
+					case 0: armAsm->Str(v2, _local(d[i].s)); break;
+					case 1: armAsm->Str(v2, _local(d[i].t)); break;
+				}
+			}
+			else
+			{
+				// m_local.d[i].s/t/q = v;
+
+				switch (j)
+				{
+					case 0: armAsm->Str(v2, _local(d[i].s)); break;
+					case 1: armAsm->Str(v2, _local(d[i].t)); break;
+					case 2: armAsm->Str(v2, _local(d[i].q)); break;
+				}
+			}
+		}
+	}
+}
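(Aside, not part of the patch: Texture() branches on m_sel.fst. With fixed ST coordinates the gradients can be truncated to integers once at setup, while the perspective STQ path must keep floats so the scanline loop can divide by the interpolated Q per pixel. A scalar model of the two layouts; the names are mine, not the patch's:)

#include <cstdint>

// fst path: S/T deltas pre-converted to integers (like the Fcvtzs above).
// !fst path: S/T/Q stay floating point for the per-pixel division by Q.
struct TexSteps
{
	int32_t s_fixed, t_fixed;
	float s, t, q;
};

static TexSteps make_tex_steps(float ds, float dt, float dq, float shift, bool fst)
{
	TexSteps out{};
	if (fst)
	{
		out.s_fixed = static_cast<int32_t>(ds * shift); // truncation toward zero
		out.t_fixed = static_cast<int32_t>(dt * shift);
	}
	else
	{
		out.s = ds * shift;
		out.t = dt * shift;
		out.q = dq * shift;
	}
	return out;
}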
+void GSSetupPrimCodeGenerator::Color()
+{
+	if (!m_en.c)
+	{
+		return;
+	}
+
+	if (m_sel.iip)
+	{
+		// GSVector4 c = dscan.c;
+		armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
+
+		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+		armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
+		armAsm->Fcvtzs(v2.V4S(), v2.V4S());
+		armAsm->Rev64(_vscratch.V4S(), v2.V4S());
+		armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
+		armAsm->Sqxtn(v2.V4H(), v2.V4S());
+		armAsm->Dup(v2.V2D(), v2.V2D(), 0);
+		armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
+
+		// GSVector4 dr = c.xxxx();
+		// GSVector4 db = c.zzzz();
+
+		armAsm->Dup(v0.V4S(), v16.V4S(), 0);
+		armAsm->Dup(v1.V4S(), v16.V4S(), 2);
+
+		for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+
+			armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
+			armAsm->Fcvtzs(v2.V4S(), v2.V4S());
+			armAsm->Sqxtn(v2.V4H(), v2.V4S());
+			armAsm->Dup(v2.V2D(), v2.V2D(), 0);
+
+			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+			armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
+			armAsm->Fcvtzs(v3.V4S(), v3.V4S());
+			armAsm->Sqxtn(v3.V4H(), v3.V4S());
+			armAsm->Dup(v3.V2D(), v3.V2D(), 0);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
+			armAsm->Str(v2, _local(d[i].rb));
+		}
+
+		// GSVector4 c = dscan.c;
+
+		// GSVector4 dg = c.yyyy();
+		// GSVector4 da = c.wwww();
+
+		armAsm->Dup(v0.V4S(), v16.V4S(), 1);
+		armAsm->Dup(v1.V4S(), v16.V4S(), 3);
+
+		for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+			armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
+			armAsm->Fcvtzs(v2.V4S(), v2.V4S());
+			armAsm->Sqxtn(v2.V4H(), v2.V4S());
+			armAsm->Dup(v2.V2D(), v2.V2D(), 0);
+
+			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+			armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
+			armAsm->Fcvtzs(v3.V4S(), v3.V4S());
+			armAsm->Sqxtn(v3.V4H(), v3.V4S());
+			armAsm->Dup(v3.V2D(), v3.V2D(), 0);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
+			armAsm->Str(v2, _local(d[i].ga));
+		}
+	}
+	else
+	{
+		// GSVector4i c = GSVector4i(vertex[index[last]].c);
+
+		int last = 0;
+
+		switch (m_sel.prim)
+		{
+			case GS_POINT_CLASS: last = 0; break;
+			case GS_LINE_CLASS: last = 1; break;
+			case GS_TRIANGLE_CLASS: last = 2; break;
+			case GS_SPRITE_CLASS: last = 1; break;
+		}
+
+		if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
+		{
+			armAsm->Ldrh(w4, MemOperand(_index, sizeof(u16) * last));
+			armAsm->Lsl(w4, w4, 6); // * sizeof(GSVertexSW)
+			armAsm->Add(x4, _vertex, x4);
+		}
+
+		armAsm->Ldr(v0, MemOperand(x4, offsetof(GSVertexSW, c)));
+		armAsm->Fcvtzs(v0.V4S(), v0.V4S());
+
+		// c = c.upl16(c.zwxy());
+
+		armAsm->Ext(v1.V16B(), v0.V16B(), v0.V16B(), 8);
+		armAsm->Zip1(v0.V8H(), v0.V8H(), v1.V8H());
+
+		// if(!tme) c = c.srl16(7);
+
+		if (m_sel.tfx == TFX_NONE)
+			armAsm->Ushr(v0.V8H(), v0.V8H(), 7);
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		armAsm->Dup(v1.V4S(), v0.V4S(), 0);
+		armAsm->Dup(v2.V4S(), v0.V4S(), 2);
+
+		armAsm->Str(v1, _local(c.rb));
+		armAsm->Str(v2, _local(c.ga));
+	}
+}
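(Aside, not part of the patch: the Sqxtn/Zip1 pairs above stand in for the x86 .ps32() (signed saturating pack) and .upl16() (low-half interleave) idioms used to build the rb/ga gradient tables. An intrinsics-level sketch of one such packing, illustrative only:)

#include <arm_neon.h>

// Pack two int32x4 gradients to int16 with signed saturation (Sqxtn), then
// interleave the 16-bit lanes (Zip1) to get the r0,b0,r1,b1,... layout that
// is stored into m_local.d[i].rb above; ga is built the same way.
static inline int16x8_t pack_rb(int32x4_t r, int32x4_t b)
{
	const int16x4_t r16 = vqmovn_s32(r);
	const int16x4_t b16 = vqmovn_s32(b);
	return vzip1q_s16(vcombine_s16(r16, r16), vcombine_s16(b16, b16));
}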
diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h
new file mode 100644
index 0000000000..675c007b09
--- /dev/null
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.h
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin, PCSX2 Team
+// SPDX-License-Identifier: GPL-3.0
+
+#pragma once
+
+#include "GS/Renderers/Common/GSFunctionMap.h"
+#include "GS/Renderers/SW/GSScanlineEnvironment.h"
+
+#include "vixl/aarch64/macro-assembler-aarch64.h"
+
+class GSSetupPrimCodeGenerator
+{
+public:
+	GSSetupPrimCodeGenerator(u64 key, void* code, size_t maxsize);
+	void Generate();
+
+	size_t GetSize() const { return m_emitter.GetSizeOfCodeGenerated(); }
+	const u8* GetCode() const { return m_emitter.GetBuffer().GetStartAddress<const u8*>(); }
+
+private:
+	void Depth();
+	void Texture();
+	void Color();
+
+	vixl::aarch64::MacroAssembler m_emitter;
+
+	GSScanlineSelector m_sel;
+
+	struct
+	{
+		u32 z : 1, f : 1, t : 1, c : 1;
+	} m_en;
+};
diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj
index be7f9860fa..a49f77ff60 100644
--- a/pcsx2/pcsx2.vcxproj
+++ b/pcsx2/pcsx2.vcxproj
@@ -207,6 +207,12 @@
       true
+
+      true
+
+
+      true
+
       true
@@ -317,7 +323,9 @@
-
+
+      true
+
@@ -333,7 +341,9 @@
-
+
+      true
+
@@ -607,6 +617,8 @@
+
+
@@ -628,6 +640,12 @@
       true
+
+      true
+
+
+      true
+
       true
@@ -727,8 +745,12 @@
-
-
+
+      true
+
+
+      true
+
@@ -743,7 +765,9 @@
-
+
+      true
+
diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters
index c5f43e59f3..a4e1643a36 100644
--- a/pcsx2/pcsx2.vcxproj.filters
+++ b/pcsx2/pcsx2.vcxproj.filters
@@ -1392,6 +1392,12 @@
       System\ISO
+
+      System\Ps2\GS\Renderers\Software
+
+
+      System\Ps2\GS\Renderers\Software
+
@@ -2303,6 +2309,18 @@
       System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec
+
+      System\Ps2\GS\Renderers\Software
+
+
+      System\Ps2\GS\Renderers\Software
+
+
+      System\Ps2\GS
+
+
+      System\Ps2\GS
+
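(Closing aside, a sketch under assumptions rather than PCSX2's actual API: GSFunctionMap-style dispatch, included by the new header above, caches one generated function per selector key, so Generate() runs once per unique pipeline configuration and later draws reuse the compiled block. The names and signature below are hypothetical.)

#include <cstdint>
#include <unordered_map>

// Hypothetical selector-keyed cache; Compile() would allocate executable
// memory and run GSSetupPrimCodeGenerator::Generate() over it.
using SetupPrimFn = void (*)(const void* vertex, const uint16_t* index, const void* dscan, void* locals);

class SetupPrimCache
{
public:
	SetupPrimFn Get(uint64_t key)
	{
		const auto it = m_map.find(key);
		if (it != m_map.end())
			return it->second;
		const SetupPrimFn fn = Compile(key);
		m_map.emplace(key, fn);
		return fn;
	}

private:
	static SetupPrimFn Compile(uint64_t key) { return nullptr; } // stub: real code would JIT here
	std::unordered_map<uint64_t, SetupPrimFn> m_map;
};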