Misc: Fix alignment errors on ARM32

This commit is contained in:
Stenzek 2024-11-22 18:44:10 +10:00
parent bb24d406f2
commit 5c03e1d940
No known key found for this signature in database
16 changed files with 230 additions and 57 deletions

View File

@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s
const GSVector4i addval = signed_output ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
for (u32 y = 0; y < 8; y++)
{
const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
// BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

View File

@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians)
GSVector2 GSMatrix2x2::row(size_t i) const
{
return GSVector2::load(&E[i][0]);
return GSVector2::load<true>(&E[i][0]);
}
GSVector2 GSMatrix2x2::col(size_t i) const

View File

@ -35,5 +35,5 @@ public:
void store(void* m);
float E[2][2];
alignas(8) float E[2][2];
};

View File

@ -690,7 +690,16 @@ public:
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
#endif
return GSVector2i(vld1_s32((const int32_t*)p));
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
{
@ -698,7 +707,19 @@ public:
std::memcpy(p, &val, sizeof(s32));
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
return;
}
#endif
vst1_s32((int32_t*)p, v.v2s);
}
ALWAYS_INLINE void operator&=(const GSVector2i& v)
{
@ -903,9 +924,30 @@ public:
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast<const float*>(p))); }
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
#endif
return GSVector2(vld1_f32(static_cast<const float*>(p)));
}
ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast<float*>(p), v.v2s); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
return;
}
#endif
vst1_f32(static_cast<float*>(p), v.v2s);
}
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
@ -2134,13 +2176,25 @@ public:
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
#endif
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
#endif
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
}
@ -2149,6 +2203,11 @@ public:
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
#endif
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
}
@ -2167,19 +2226,45 @@ public:
std::memcpy(p, &val, sizeof(u32));
}
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
return;
}
#endif
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
return;
}
#endif
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
return;
}
#endif
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
}
@ -2652,8 +2737,14 @@ public:
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
#endif
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
}
@ -2662,32 +2753,55 @@ public:
template<bool aligned>
ALWAYS_INLINE static GSVector4 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
#endif
return GSVector4(vld1q_f32((const float*)p));
}
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
return;
}
#endif
vst1_f32((float*)p, vget_low_f32(v.v4s));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
return;
}
#endif
vst1_f32((float*)p, vget_high_f32(v.v4s));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
return;
}
#endif
vst1q_f32((float*)p, v.v4s);
}

View File

@ -467,6 +467,7 @@ public:
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
GSVector2i ret;
@ -474,7 +475,11 @@ public:
return ret;
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
std::memcpy(p, v.S32, sizeof(S32));
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
@ -658,6 +663,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
GSVector2 ret;
@ -665,7 +671,11 @@ public:
return ret;
}
ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
std::memcpy(p, &v.F32, sizeof(F32));
}
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
@ -1530,6 +1540,7 @@ public:
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
GSVector4i ret;
@ -1538,6 +1549,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
GSVector4i ret;
@ -1546,7 +1558,11 @@ public:
return ret;
}
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
return loadh<true>(&v);
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
@ -1558,9 +1574,17 @@ public:
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
std::memcpy(p, &v.S32[0], sizeof(s32) * 2);
}
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
std::memcpy(p, &v.S32[2], sizeof(s32) * 2);
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
@ -1958,6 +1982,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
GSVector4 ret;
@ -1977,9 +2002,17 @@ public:
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }
ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
std::memcpy(p, &v.x, sizeof(float) * 2);
}
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
std::memcpy(p, &v.z, sizeof(float) * 2);
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)

View File

@ -585,12 +585,19 @@ public:
ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
@ -806,11 +813,13 @@ public:
ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
@ -1711,16 +1720,19 @@ public:
ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
@ -1734,7 +1746,14 @@ public:
}
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
_mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
@ -2115,6 +2134,7 @@ public:
ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
@ -2127,10 +2147,14 @@ public:
}
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
_mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));

View File

@ -549,7 +549,9 @@ u8* Bus::GetLUTFastmemPointer(u32 address, u8* ram_ptr)
void Bus::MapFastmemViews()
{
#ifdef ENABLE_MMAP_FASTMEM
Assert(s_fastmem_ram_views.empty());
#endif
const CPUFastmemMode mode = g_settings.cpu_fastmem_mode;
if (mode == CPUFastmemMode::MMap)

View File

@ -345,8 +345,8 @@ protected:
// However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect
// the edges with the clip rectangle.
// TODO: Coordinates are exclusive, so off by one here...
const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x);
const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z);
const GSVector2i clamp_min = GSVector2i::load<true>(&m_clamped_drawing_area.x);
const GSVector2i clamp_max = GSVector2i::load<true>(&m_clamped_drawing_area.z);
v1 = v1.sat_s32(clamp_min, clamp_max);
v2 = v2.sat_s32(clamp_min, clamp_max);
v3 = v3.sat_s32(clamp_min, clamp_max);

View File

@ -2570,9 +2570,9 @@ void GPU_HW::LoadVertices()
}
// Cull polygons which are too large.
const GSVector2 v0f = GSVector2::load(&vertices[0].x);
const GSVector2 v1f = GSVector2::load(&vertices[1].x);
const GSVector2 v2f = GSVector2::load(&vertices[2].x);
const GSVector2 v0f = GSVector2::load<false>(&vertices[0].x);
const GSVector2 v1f = GSVector2::load<false>(&vertices[1].x);
const GSVector2 v2f = GSVector2::load<false>(&vertices[2].x);
const GSVector2 min_pos_12 = v1f.min(v2f);
const GSVector2 max_pos_12 = v1f.max(v2f);
const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
@ -2617,7 +2617,7 @@ void GPU_HW::LoadVertices()
// quads
if (rc.quad_polygon)
{
const GSVector2 v3f = GSVector2::load(&vertices[3].x);
const GSVector2 v3f = GSVector2::load<false>(&vertices[3].x);
const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
.add32(GSVector4i::cxpr(0, 0, 1, 1));
const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
@ -2845,9 +2845,9 @@ void GPU_HW::LoadVertices()
{
GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
FillDrawCommand(cmd, rc);
GSVector4i::storel(&cmd->vertices[0], bounds);
GSVector4i::storel<false>(&cmd->vertices[0], bounds);
cmd->vertices[0].color = start_color;
GSVector4i::storeh(&cmd->vertices[1], bounds);
GSVector4i::storeh<false>(&cmd->vertices[1], bounds);
cmd->vertices[1].color = end_color;
m_sw_renderer->PushCommand(cmd);
}
@ -2870,7 +2870,7 @@ void GPU_HW::LoadVertices()
{
cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
FillDrawCommand(cmd, rc);
GSVector2i::store(&cmd->vertices[0].x, start_pos);
GSVector2i::store<false>(&cmd->vertices[0].x, start_pos);
cmd->vertices[0].color = start_color;
}
else
@ -2905,7 +2905,7 @@ void GPU_HW::LoadVertices()
if (cmd)
{
GSVector2i::store(&cmd->vertices[i], end_pos);
GSVector2i::store<false>(&cmd->vertices[i], end_pos);
cmd->vertices[i].color = end_color;
}
}
@ -2978,7 +2978,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();
const GSVector4i page_offset = GSVector4i::loadl<true>(m_current_texture_page_offset).xyxy();
uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift)); // shift only goes on the x
uv_rect = uv_rect.add32(page_offset); // page offset

View File

@ -317,7 +317,7 @@ private:
GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
GSVector4i m_current_uv_rect = INVALID_RECT;
GSVector4i m_current_draw_rect = INVALID_RECT;
s32 m_current_texture_page_offset[2] = {};
alignas(8) s32 m_current_texture_page_offset[2] = {};
std::unique_ptr<GPUPipeline> m_wireframe_pipeline;

View File

@ -3305,8 +3305,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
// TODO: Use rects instead of fullscreen tris, maybe avoid the copy..
alignas(VECTOR_ALIGNMENT) float uniforms[4];
GSVector2 texture_size = GSVector2(GSVector2i(entry->texture->GetWidth(), entry->texture->GetHeight()));
GSVector2::store(&uniforms[0], texture_size);
GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
GSVector2::store<true>(&uniforms[0], texture_size);
GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
g_gpu_device->InvalidateRenderTarget(s_state.replacement_texture_render_target.get());
g_gpu_device->SetRenderTarget(s_state.replacement_texture_render_target.get());
g_gpu_device->SetViewportAndScissor(0, 0, new_width, new_height);
@ -3325,8 +3325,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
const GSVector4i dst_rect = GSVector4i(GSVector4(si.dst_rect) * max_scale_v);
texture_size = GSVector2(GSVector2i(temp_texture->GetWidth(), temp_texture->GetHeight()));
GSVector2::store(&uniforms[0], texture_size);
GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
GSVector2::store<true>(&uniforms[0], texture_size);
GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
g_gpu_device->SetViewportAndScissor(dst_rect);
g_gpu_device->SetTextureSampler(0, temp_texture.get(), g_gpu_device->GetNearestSampler());
g_gpu_device->SetPipeline(si.invert_alpha ? s_state.replacement_semitransparent_draw_pipeline.get() :

View File

@ -537,7 +537,7 @@ void GPU_SW::DispatchRenderCommand()
vert->x = m_drawing_offset.x + vp.x;
vert->y = m_drawing_offset.y + vp.y;
vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
positions[i] = GSVector2i::load(&vert->x);
positions[i] = GSVector2i::load<false>(&vert->x);
}
// Cull polygons which are too large.
@ -686,8 +686,8 @@ void GPU_SW::DispatchRenderCommand()
cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
}
const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
const GSVector4i v0 = GSVector4i::loadl<false>(&cmd->vertices[0].x);
const GSVector4i v1 = GSVector4i::loadl<false>(&cmd->vertices[1].x);
const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
@ -711,7 +711,7 @@ void GPU_SW::DispatchRenderCommand()
u32 buffer_pos = 0;
const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
const GSVector2i draw_offset = GSVector2i::load<false>(&m_drawing_offset.x);
GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
u32 start_color = m_render_command.color_for_first_vertex;
@ -740,9 +740,9 @@ void GPU_SW::DispatchRenderCommand()
GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
out_vertex_count += 2;
GSVector2i::store(&out_vertex[0].x, start_pos);
GSVector2i::store<false>(&out_vertex[0].x, start_pos);
out_vertex[0].color = start_color;
GSVector2i::store(&out_vertex[1].x, end_pos);
GSVector2i::store<false>(&out_vertex[1].x, end_pos);
out_vertex[1].color = end_color;
}

View File

@ -397,7 +397,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
if (x <= (VRAM_WIDTH - 4))
{
return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
return GSVector4i::loadl<false>(&g_vram[y * VRAM_WIDTH + x]).u16to32();
}
else
{
@ -415,7 +415,7 @@ ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
const GSVector4i packed_color = color.pu32();
if (x <= (VRAM_WIDTH - 4))
{
GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
GSVector4i::storel<false>(&g_vram[y * VRAM_WIDTH + x], packed_color);
}
else
{

View File

@ -711,10 +711,10 @@ void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late)
for (u32 index = 0; index < s_state.block_rgb.size(); index += 16)
{
const GSVector4i rgbx0 = GSVector4i::load<false>(&s_state.block_rgb[index]);
const GSVector4i rgbx1 = GSVector4i::load<false>(&s_state.block_rgb[index + 4]);
const GSVector4i rgbx2 = GSVector4i::load<false>(&s_state.block_rgb[index + 8]);
const GSVector4i rgbx3 = GSVector4i::load<false>(&s_state.block_rgb[index + 12]);
const GSVector4i rgbx0 = GSVector4i::load<true>(&s_state.block_rgb[index]);
const GSVector4i rgbx1 = GSVector4i::load<true>(&s_state.block_rgb[index + 4]);
const GSVector4i rgbx2 = GSVector4i::load<true>(&s_state.block_rgb[index + 8]);
const GSVector4i rgbx3 = GSVector4i::load<true>(&s_state.block_rgb[index + 12]);
GSVector4i::store<true>(&rgbp[0], rgbx0.shuffle8(mask00) | rgbx1.shuffle8(mask01));
GSVector4i::store<true>(&rgbp[4], rgbx1.shuffle8(mask11) | rgbx2.shuffle8(mask12));
@ -1048,8 +1048,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const
const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
for (u32 y = 0; y < 8; y++)
{
const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
// BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

View File

@ -2318,7 +2318,7 @@ void SPU::ProcessReverb(s32 left_in, s32 right_in, s32* left_out, s32* right_out
srcs = GSVector4i::load<false>(&src[8]);
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[8]).mul32l(srcs.s16to32()));
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[12]).mul32l(srcs.uph64().s16to32()));
srcs = GSVector4i::loadl(&src[16]);
srcs = GSVector4i::loadl<false>(&src[16]);
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[16]).mul32l(srcs.s16to32()));
out[channel] = std::clamp<s32>(acc.addv_s32() >> 14, -32768, 32767);

View File

@ -1683,7 +1683,7 @@ GPUDevice::PresentResult PostProcessing::ReShadeFXShader::Apply(GPUTexture* inpu
case SourceOptionType::ViewportOffset:
{
GSVector4::storel(dst, GSVector4(final_rect));
GSVector4::storel<false>(dst, GSVector4(final_rect));
}
break;