From 5c03e1d9405b634e342f481f6ee9b7f1c6524fa4 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 22 Nov 2024 18:44:10 +1000
Subject: [PATCH] Misc: Fix alignment errors on ARM32

---
 src/common-tests/gsvector_yuvtorgb_test.cpp |   4 +-
 src/common/gsvector.cpp                     |   2 +-
 src/common/gsvector.h                       |   2 +-
 src/common/gsvector_neon.h                  | 138 ++++++++++++++++++--
 src/common/gsvector_nosimd.h                |  47 ++++++-
 src/common/gsvector_sse.h                   |  28 +++-
 src/core/bus.cpp                            |   2 +
 src/core/gpu.h                              |   4 +-
 src/core/gpu_hw.cpp                         |  18 +--
 src/core/gpu_hw.h                           |   2 +-
 src/core/gpu_hw_texture_cache.cpp           |   8 +-
 src/core/gpu_sw.cpp                         |  12 +-
 src/core/gpu_sw_rasterizer.inl              |   4 +-
 src/core/mdec.cpp                           |  12 +-
 src/core/spu.cpp                            |   2 +-
 src/util/postprocessing_shader_fx.cpp       |   2 +-
 16 files changed, 230 insertions(+), 57 deletions(-)
diff --git a/src/common-tests/gsvector_yuvtorgb_test.cpp b/src/common-tests/gsvector_yuvtorgb_test.cpp
index 3153b2822..edd435c11 100644
--- a/src/common-tests/gsvector_yuvtorgb_test.cpp
+++ b/src/common-tests/gsvector_yuvtorgb_test.cpp
@@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s
   const GSVector4i addval = signed_output ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
   for (u32 y = 0; y < 8; y++)
   {
-    const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32();
-    const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32();
+    const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(y / 2) * 8]).s16to32();
+    const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(y / 2) * 8]).s16to32();
     const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
 
     // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
diff --git a/src/common/gsvector.cpp b/src/common/gsvector.cpp
index 448ddcae8..392b9da44 100644
--- a/src/common/gsvector.cpp
+++ b/src/common/gsvector.cpp
@@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians)
 
 GSVector2 GSMatrix2x2::row(size_t i) const
 {
-  return GSVector2::load(&E[i][0]);
+  return GSVector2::load<true>(&E[i][0]);
 }
 
 GSVector2 GSMatrix2x2::col(size_t i) const
diff --git a/src/common/gsvector.h b/src/common/gsvector.h
index b343a43c9..3bccfa3ca 100644
--- a/src/common/gsvector.h
+++ b/src/common/gsvector.h
@@ -35,5 +35,5 @@ public:
 
   void store(void* m);
 
-  float E[2][2];
+  alignas(8) float E[2][2];
 };
diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h
index 9a558269f..c8efec076 100644
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@@ -690,7 +690,16 @@ public:
 
   ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
 
-  ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector2i load(const void* p)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
+#endif
+
+    return GSVector2i(vld1_s32((const int32_t*)p));
+  }
 
   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
   {
@@ -698,7 +707,19 @@ public:
     std::memcpy(p, &val, sizeof(s32));
   }
 
-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
+      return;
+    }
+#endif
+
+    vst1_s32((int32_t*)p, v.v2s);
+  }
 
   ALWAYS_INLINE void operator&=(const GSVector2i& v)
   {
@@ -903,9 +924,30 @@ public:
 
   ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
 
-  ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast<const float*>(p))); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector2 load(const void* p)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
+#endif
 
-  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast<float*>(p), v.v2s); }
+    return GSVector2(vld1_f32(static_cast<const float*>(p)));
+  }
+
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2& v)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
+      return;
+    }
+#endif
+
+    vst1_f32(static_cast<float*>(p), v.v2s);
+  }
 
   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
 
@@ -2134,13 +2176,25 @@ public:
 
   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
+#endif
+
     return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
+#endif
+
     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
   }
 
@@ -2149,6 +2203,11 @@ public:
   template<bool aligned>
   ALWAYS_INLINE static GSVector4i load(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
+#endif
+
     return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
   }
 
@@ -2167,19 +2226,45 @@ public:
     std::memcpy(p, &val, sizeof(u32));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
+      return;
+    }
+#endif
+
     vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
+      return;
+    }
+#endif
+
     vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
   }
 
   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
+      return;
+    }
+#endif
+
     vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
   }
 
@@ -2652,8 +2737,14 @@ public:
 
   ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
+#endif
+
     return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
   }
 
@@ -2662,32 +2753,55 @@ public:
   template<bool aligned>
   ALWAYS_INLINE static GSVector4 load(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
+#endif
+
     return GSVector4(vld1q_f32((const float*)p));
   }
 
   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
 
+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   {
-#ifdef CPU_ARCH_ARM64
-    vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
-#else
-    vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
+      return;
+    }
 #endif
+
+    vst1_f32((float*)p, vget_low_f32(v.v4s));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   {
-#ifdef CPU_ARCH_ARM64
-    vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
-#else
-    vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
+      return;
+    }
 #endif
+
+    vst1_f32((float*)p, vget_high_f32(v.v4s));
   }
 
   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
+      return;
+    }
+#endif
+
     vst1q_f32((float*)p, v.v4s);
   }
 
diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h
index b7a719858..6d73592d9 100644
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@@ -467,6 +467,7 @@ public:
 
   ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector2i load(const void* p)
   {
     GSVector2i ret;
@@ -474,7 +475,11 @@ public:
     return ret;
   }
 
-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+    std::memcpy(p, v.S32, sizeof(S32));
+  }
 
   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
 
@@ -658,6 +663,7 @@ public:
     return ret;
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector2 load(const void* p)
   {
     GSVector2 ret;
@@ -665,7 +671,11 @@ public:
     return ret;
   }
 
-  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2& v)
+  {
+    std::memcpy(p, &v.F32, sizeof(F32));
+  }
 
   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
 
@@ -1530,6 +1540,7 @@ public:
 
   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
     GSVector4i ret;
@@ -1538,6 +1549,7 @@ public:
     return ret;
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
     GSVector4i ret;
@@ -1546,7 +1558,11 @@ public:
     return ret;
   }
 
-  ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
+  {
+    return loadh<true>(&v);
+  }
 
   template<bool aligned>
   ALWAYS_INLINE static GSVector4i load(const void* p)
@@ -1558,9 +1574,17 @@ public:
 
   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }
 
-  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
+  {
+    std::memcpy(p, &v.S32[0], sizeof(s32) * 2);
+  }
 
-  ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
+  {
+    std::memcpy(p, &v.S32[2], sizeof(s32) * 2);
+  }
 
   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
@@ -1958,6 +1982,7 @@ public:
     return ret;
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
     GSVector4 ret;
@@ -1977,9 +2002,17 @@ public:
 
   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }
 
-  ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
+  {
+    std::memcpy(p, &v.x, sizeof(float) * 2);
+  }
 
-  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
+  {
+    std::memcpy(p, &v.z, sizeof(float) * 2);
+  }
 
   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h
index 86517ed56..c37d50d2f 100644
--- a/src/common/gsvector_sse.h
+++ b/src/common/gsvector_sse.h
@@ -585,12 +585,19 @@ public:
 
   ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
   ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }
+
+  template<bool aligned>
   ALWAYS_INLINE static GSVector2i load(const void* p)
   {
     return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
   }
 
-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+    _mm_storel_epi64(static_cast<__m128i*>(p), v.m);
+  }
+
   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
 
   ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
@@ -806,11 +813,13 @@ public:
 
   ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector2 load(const void* p)
   {
     return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector2& v)
   {
     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
@@ -1711,16 +1720,19 @@ public:
   ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
     return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
     return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
   }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
   {
     return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
@@ -1734,7 +1746,14 @@ public:
   }
 
   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
-  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
+
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
+  {
+    _mm_storel_epi64(static_cast<__m128i*>(p), v.m);
+  }
+
+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
   {
     _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
@@ -2115,6 +2134,7 @@ public:
 
   ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
 
+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
     return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
@@ -2127,10 +2147,14 @@ public:
   }
 
   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
+
+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   {
     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
   }
+
+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   {
     _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
diff --git a/src/core/bus.cpp b/src/core/bus.cpp
index 2c7bcb3ca..2c04e8680 100644
--- a/src/core/bus.cpp
+++ b/src/core/bus.cpp
@@ -549,7 +549,9 @@ u8* Bus::GetLUTFastmemPointer(u32 address, u8* ram_ptr)
 
 void Bus::MapFastmemViews()
 {
+#ifdef ENABLE_MMAP_FASTMEM
   Assert(s_fastmem_ram_views.empty());
+#endif
 
   const CPUFastmemMode mode = g_settings.cpu_fastmem_mode;
   if (mode == CPUFastmemMode::MMap)
diff --git a/src/core/gpu.h b/src/core/gpu.h
index a4f858794..a98b20cbb 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -345,8 +345,8 @@ protected:
     // However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect
     // the edges with the clip rectangle.
     // TODO: Coordinates are exclusive, so off by one here...
-    const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x);
-    const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z);
+    const GSVector2i clamp_min = GSVector2i::load<true>(&m_clamped_drawing_area.x);
+    const GSVector2i clamp_max = GSVector2i::load<true>(&m_clamped_drawing_area.z);
     v1 = v1.sat_s32(clamp_min, clamp_max);
     v2 = v2.sat_s32(clamp_min, clamp_max);
     v3 = v3.sat_s32(clamp_min, clamp_max);
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 13dd40ebd..3b93b2f3d 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -2570,9 +2570,9 @@ void GPU_HW::LoadVertices()
       }
 
       // Cull polygons which are too large.
-      const GSVector2 v0f = GSVector2::load(&vertices[0].x);
-      const GSVector2 v1f = GSVector2::load(&vertices[1].x);
-      const GSVector2 v2f = GSVector2::load(&vertices[2].x);
+      const GSVector2 v0f = GSVector2::load<false>(&vertices[0].x);
+      const GSVector2 v1f = GSVector2::load<false>(&vertices[1].x);
+      const GSVector2 v2f = GSVector2::load<false>(&vertices[2].x);
       const GSVector2 min_pos_12 = v1f.min(v2f);
       const GSVector2 max_pos_12 = v1f.max(v2f);
       const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
@@ -2617,7 +2617,7 @@ void GPU_HW::LoadVertices()
       // quads
       if (rc.quad_polygon)
       {
-        const GSVector2 v3f = GSVector2::load(&vertices[3].x);
+        const GSVector2 v3f = GSVector2::load<false>(&vertices[3].x);
         const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
                                            .add32(GSVector4i::cxpr(0, 0, 1, 1));
         const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
@@ -2845,9 +2845,9 @@ void GPU_HW::LoadVertices()
         {
           GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
           FillDrawCommand(cmd, rc);
-          GSVector4i::storel(&cmd->vertices[0], bounds);
+          GSVector4i::storel<false>(&cmd->vertices[0], bounds);
           cmd->vertices[0].color = start_color;
-          GSVector4i::storeh(&cmd->vertices[1], bounds);
+          GSVector4i::storeh<false>(&cmd->vertices[1], bounds);
           cmd->vertices[1].color = end_color;
           m_sw_renderer->PushCommand(cmd);
         }
@@ -2870,7 +2870,7 @@ void GPU_HW::LoadVertices()
         {
           cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
           FillDrawCommand(cmd, rc);
-          GSVector2i::store(&cmd->vertices[0].x, start_pos);
+          GSVector2i::store<false>(&cmd->vertices[0].x, start_pos);
           cmd->vertices[0].color = start_color;
         }
         else
@@ -2905,7 +2905,7 @@ void GPU_HW::LoadVertices()
 
           if (cmd)
           {
-            GSVector2i::store(&cmd->vertices[i], end_pos);
+            GSVector2i::store<false>(&cmd->vertices[i], end_pos);
             cmd->vertices[i].color = end_color;
           }
         }
@@ -2978,7 +2978,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
 
   const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
   const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
-  const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();
+  const GSVector4i page_offset = GSVector4i::loadl<true>(m_current_texture_page_offset).xyxy();
 
   uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift));   // shift only goes on the x
   uv_rect = uv_rect.add32(page_offset);                  // page offset
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index a35f88bad..46bdc78fa 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -317,7 +317,7 @@ private:
   GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
   GSVector4i m_current_uv_rect = INVALID_RECT;
   GSVector4i m_current_draw_rect = INVALID_RECT;
-  s32 m_current_texture_page_offset[2] = {};
+  alignas(8) s32 m_current_texture_page_offset[2] = {};
 
   std::unique_ptr<GPUPipeline> m_wireframe_pipeline;
 
diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp
index 1efc0988d..f7f825887 100644
--- a/src/core/gpu_hw_texture_cache.cpp
+++ b/src/core/gpu_hw_texture_cache.cpp
@@ -3305,8 +3305,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
   // TODO: Use rects instead of fullscreen tris, maybe avoid the copy..
   alignas(VECTOR_ALIGNMENT) float uniforms[4];
   GSVector2 texture_size = GSVector2(GSVector2i(entry->texture->GetWidth(), entry->texture->GetHeight()));
-  GSVector2::store(&uniforms[0], texture_size);
-  GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
+  GSVector2::store<true>(&uniforms[0], texture_size);
+  GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
   g_gpu_device->InvalidateRenderTarget(s_state.replacement_texture_render_target.get());
   g_gpu_device->SetRenderTarget(s_state.replacement_texture_render_target.get());
   g_gpu_device->SetViewportAndScissor(0, 0, new_width, new_height);
@@ -3325,8 +3325,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
 
     const GSVector4i dst_rect = GSVector4i(GSVector4(si.dst_rect) * max_scale_v);
     texture_size = GSVector2(GSVector2i(temp_texture->GetWidth(), temp_texture->GetHeight()));
-    GSVector2::store(&uniforms[0], texture_size);
-    GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
+    GSVector2::store<true>(&uniforms[0], texture_size);
+    GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
     g_gpu_device->SetViewportAndScissor(dst_rect);
     g_gpu_device->SetTextureSampler(0, temp_texture.get(), g_gpu_device->GetNearestSampler());
     g_gpu_device->SetPipeline(si.invert_alpha ? s_state.replacement_semitransparent_draw_pipeline.get() :
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index 10eb5a5bc..068db4164 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -537,7 +537,7 @@ void GPU_SW::DispatchRenderCommand()
         vert->x = m_drawing_offset.x + vp.x;
         vert->y = m_drawing_offset.y + vp.y;
         vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
-        positions[i] = GSVector2i::load(&vert->x);
+        positions[i] = GSVector2i::load<false>(&vert->x);
       }
 
       // Cull polygons which are too large.
@@ -686,8 +686,8 @@ void GPU_SW::DispatchRenderCommand()
           cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
         }
 
-        const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
-        const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
+        const GSVector4i v0 = GSVector4i::loadl<false>(&cmd->vertices[0].x);
+        const GSVector4i v1 = GSVector4i::loadl<false>(&cmd->vertices[1].x);
         const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
         const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
 
@@ -711,7 +711,7 @@ void GPU_SW::DispatchRenderCommand()
 
         u32 buffer_pos = 0;
         const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
-        const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
+        const GSVector2i draw_offset = GSVector2i::load<false>(&m_drawing_offset.x);
         GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
         u32 start_color = m_render_command.color_for_first_vertex;
 
@@ -740,9 +740,9 @@ void GPU_SW::DispatchRenderCommand()
             GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
             out_vertex_count += 2;
 
-            GSVector2i::store(&out_vertex[0].x, start_pos);
+            GSVector2i::store<false>(&out_vertex[0].x, start_pos);
             out_vertex[0].color = start_color;
-            GSVector2i::store(&out_vertex[1].x, end_pos);
+            GSVector2i::store<false>(&out_vertex[1].x, end_pos);
             out_vertex[1].color = end_color;
           }
 
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 128f3af9c..3343862a9 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -397,7 +397,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
 {
   if (x <= (VRAM_WIDTH - 4))
   {
-    return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+    return GSVector4i::loadl<false>(&g_vram[y * VRAM_WIDTH + x]).u16to32();
   }
   else
   {
@@ -415,7 +415,7 @@ ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
   const GSVector4i packed_color = color.pu32();
   if (x <= (VRAM_WIDTH - 4))
   {
-    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
+    GSVector4i::storel<false>(&g_vram[y * VRAM_WIDTH + x], packed_color);
   }
   else
   {
diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp
index de2029833..f5a54f060 100644
--- a/src/core/mdec.cpp
+++ b/src/core/mdec.cpp
@@ -711,10 +711,10 @@ void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late)
 
       for (u32 index = 0; index < s_state.block_rgb.size(); index += 16)
       {
-        const GSVector4i rgbx0 = GSVector4i::load<false>(&s_state.block_rgb[index]);
-        const GSVector4i rgbx1 = GSVector4i::load<false>(&s_state.block_rgb[index + 4]);
-        const GSVector4i rgbx2 = GSVector4i::load<false>(&s_state.block_rgb[index + 8]);
-        const GSVector4i rgbx3 = GSVector4i::load<false>(&s_state.block_rgb[index + 12]);
+        const GSVector4i rgbx0 = GSVector4i::load<true>(&s_state.block_rgb[index]);
+        const GSVector4i rgbx1 = GSVector4i::load<true>(&s_state.block_rgb[index + 4]);
+        const GSVector4i rgbx2 = GSVector4i::load<true>(&s_state.block_rgb[index + 8]);
+        const GSVector4i rgbx3 = GSVector4i::load<true>(&s_state.block_rgb[index + 12]);
 
         GSVector4i::store<true>(&rgbp[0], rgbx0.shuffle8(mask00) | rgbx1.shuffle8(mask01));
         GSVector4i::store<true>(&rgbp[4], rgbx1.shuffle8(mask11) | rgbx2.shuffle8(mask12));
@@ -1048,8 +1048,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const
   const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
   for (u32 y = 0; y < 8; y++)
   {
-    const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
-    const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
+    const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
+    const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
     const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
 
     // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
diff --git a/src/core/spu.cpp b/src/core/spu.cpp
index 4800ec264..f24c8c8b2 100644
--- a/src/core/spu.cpp
+++ b/src/core/spu.cpp
@@ -2318,7 +2318,7 @@ void SPU::ProcessReverb(s32 left_in, s32 right_in, s32* left_out, s32* right_out
       srcs = GSVector4i::load<false>(&src[8]);
       acc = acc.add32(GSVector4i::load<true>(&resample_coeff[8]).mul32l(srcs.s16to32()));
       acc = acc.add32(GSVector4i::load<true>(&resample_coeff[12]).mul32l(srcs.uph64().s16to32()));
-      srcs = GSVector4i::loadl(&src[16]);
+      srcs = GSVector4i::loadl<false>(&src[16]);
       acc = acc.add32(GSVector4i::load<true>(&resample_coeff[16]).mul32l(srcs.s16to32()));
 
       out[channel] = std::clamp<s32>(acc.addv_s32() >> 14, -32768, 32767);
diff --git a/src/util/postprocessing_shader_fx.cpp b/src/util/postprocessing_shader_fx.cpp
index 6486ce9ae..dc4234048 100644
--- a/src/util/postprocessing_shader_fx.cpp
+++ b/src/util/postprocessing_shader_fx.cpp
@@ -1683,7 +1683,7 @@ GPUDevice::PresentResult PostProcessing::ReShadeFXShader::Apply(GPUTexture* inpu
 
         case SourceOptionType::ViewportOffset:
         {
-          GSVector4::storel(dst, GSVector4(final_rect));
+          GSVector4::storel<false>(dst, GSVector4(final_rect));
         }
         break;