From 40670726b49df0d1d56eae82fcc930265b9c92e8 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 20 Sep 2024 23:18:20 +1000
Subject: [PATCH] GPU/SW: Fix SSE4 path and add 256-bit AVX2 path

---
 src/core/gpu_sw_rasterizer.cpp |   2 +
 src/core/gpu_sw_rasterizer.inl | 996 +++++++++++++++++++++------------
 2 files changed, 651 insertions(+), 347 deletions(-)

diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 0df0c16c2..371766ddb 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -44,7 +44,9 @@ namespace GPU_SW_Rasterizer {
 // Default vector implementation definitions.
 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
 namespace GPU_SW_Rasterizer::SIMD {
+#define USE_VECTOR 1
 #include "gpu_sw_rasterizer.inl"
+#undef USE_VECTOR
 }
 #endif
 
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index ebfeb4efd..be4ad94b2 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -9,6 +9,7 @@
 
 #define USE_VECTOR 1
 #define GSVECTOR_HAS_SRLV 1
+#define GSVECTOR_HAS_256 1
 
 extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
 
@@ -22,6 +23,7 @@ namespace GPU_SW_Rasterizer {
 #if 0
 static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
 static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+static u32 s_bad_counter = 0;
 #define BACKUP_VRAM()                                                                                                  \
   do                                                                                                                   \
   {                                                                                                                    \
@@ -257,50 +259,138 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
 
 #else // USE_VECTOR
 
+#ifdef GSVECTOR_HAS_256
+using GSVectorNi = GSVector8i;
+static constexpr GSVector8i SPAN_OFFSET_VEC = GSVector8i::cxpr(0, 1, 2, 3, 4, 5, 6, 7);
+static constexpr GSVector8i SPAN_WIDTH_VEC = GSVector8i::cxpr(1, 2, 3, 4, 5, 6, 7, 8);
+static constexpr GSVector8i PIXELS_PER_VEC_VEC = GSVector8i::cxpr(8);
+static constexpr u32 PIXELS_PER_VEC = 8;
+#else
+using GSVectorNi = GSVector4i;
+static constexpr GSVector4i SPAN_OFFSET_VEC = GSVector4i::cxpr(0, 1, 2, 3);
+static constexpr GSVector4i SPAN_WIDTH_VEC = GSVector4i::cxpr(1, 2, 3, 4);
+static constexpr GSVector4i PIXELS_PER_VEC_VEC = GSVector4i::cxpr(4);
+static constexpr u32 PIXELS_PER_VEC = 4;
+#endif
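+
+// Illustrative scalar form of the per-lane span tests these constants feed:
+// lane i of (width - SPAN_WIDTH_VEC) holds width - (i + 1), which goes negative
+// exactly when pixel i lies past the end of the span, so wvec.lt32(zero)
+// yields the tail preserve mask directly.
+static_assert((4 - (3 + 1)) >= 0 && (4 - (4 + 1)) < 0,
+              "with width == 4, lane 3 is the last live pixel");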
+
+#ifdef GSVECTOR_HAS_256
+
+ALWAYS_INLINE_RELEASE static GSVector8i GatherVector(GSVector8i coord_x, GSVector8i coord_y)
+{
+  const GSVector8i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
+  GSVector8i pixels = GSVector8i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
+  pixels = pixels.insert16<8>(g_vram[static_cast<u32>(offsets.extract32<4>())]);
+  pixels = pixels.insert16<10>(g_vram[static_cast<u32>(offsets.extract32<5>())]);
+  pixels = pixels.insert16<12>(g_vram[static_cast<u32>(offsets.extract32<6>())]);
+  pixels = pixels.insert16<14>(g_vram[static_cast<u32>(offsets.extract32<7>())]);
+  return pixels;
+}
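+
+// The offsets above are u16 element indices rather than byte offsets: indexing
+// g_vram directly absorbs the old sll32<11> (x * sizeof(u16)) scaling, and the
+// insert16 calls only touch the even u16 slots, so each 32-bit lane ends up
+// holding a zero-extended pixel. Assuming VRAM_WIDTH == 1024:
+static_assert(((2u << 10) + 5u) == (2u * 1024u + 5u), "sll32<10> computes y * VRAM_WIDTH + x");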
+
+template<u32 mask>
+ALWAYS_INLINE_RELEASE static GSVector8i GatherCLUTVector(GSVector8i indices, GSVector8i shifts)
+{
+  const GSVector8i offsets = indices.srlv32(shifts) & GSVector8i::cxpr(mask);
+  GSVector8i pixels = GSVector8i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
+  pixels = pixels.insert16<8>(g_gpu_clut[static_cast<u32>(offsets.extract32<4>())]);
+  pixels = pixels.insert16<10>(g_gpu_clut[static_cast<u32>(offsets.extract32<5>())]);
+  pixels = pixels.insert16<12>(g_gpu_clut[static_cast<u32>(offsets.extract32<6>())]);
+  pixels = pixels.insert16<14>(g_gpu_clut[static_cast<u32>(offsets.extract32<7>())]);
+  return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector8i LoadVector(u32 x, u32 y)
+{
+  // TODO: Split into high/low
+  if (x <= (VRAM_WIDTH - 8))
+  {
+    return GSVector8i::u16to32(GSVector4i::load<false>(&g_vram[y * VRAM_WIDTH + x]));
+  }
+  else
+  {
+    // TODO: Avoid loads for masked pixels if a contiguous region is masked
+    const u16* line = &g_vram[y * VRAM_WIDTH];
+    GSVector8i pixels = GSVector8i::zero();
+    pixels = pixels.insert16<0>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<6>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<8>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<10>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<12>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<14>(line[(x++) & VRAM_WIDTH_MASK]);
+    return pixels;
+  }
+}
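+
+// The tail path wraps horizontally instead of reading out of bounds, matching
+// the scalar rasterizer; this relies on VRAM_WIDTH being a power of two.
+static_assert((VRAM_WIDTH & (VRAM_WIDTH - 1)) == 0 && ((VRAM_WIDTH_MASK + 1) & VRAM_WIDTH_MASK) == 0,
+              "x == VRAM_WIDTH wraps back to column 0");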
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector8i color)
+{
+  // TODO: Split into high/low
+  const GSVector4i packed = color.low128().pu32(color.high128());
+  if (x <= (VRAM_WIDTH - 8))
+  {
+    GSVector4i::store<false>(&g_vram[y * VRAM_WIDTH + x], packed);
+  }
+  else
+  {
+    // TODO: Avoid stores for masked pixels if a contiguous region is masked
+    u16* line = &g_vram[y * VRAM_WIDTH];
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<2>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<3>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<4>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<5>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<6>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<7>());
+  }
+}
+
+#else
+
 ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
 {
-  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
-  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+  const GSVector4i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
 
-  const u32 o0 = offsets.extract32<0>();
-  const u32 o1 = offsets.extract32<1>();
-  const u32 o2 = offsets.extract32<2>();
-  const u32 o3 = offsets.extract32<3>();
-
-  // TODO: split in two, merge, maybe could be zx loaded instead..
-  u16 p0, p1, p2, p3;
-  std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
-  std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
-  std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
-  std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
-  GSVector4i pixels = GSVector4i::zext32(p0);
-  pixels = pixels.insert16<2>(p1);
-  pixels = pixels.insert16<4>(p2);
-  pixels = pixels.insert16<6>(p3);
+  // Clang seems to optimize this directly into pextrd+pinsrw, good.
+  GSVector4i pixels = GSVector4i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
 
   return pixels;
 }
 
-ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
+template<u32 mask>
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices, GSVector4i shifts)
 {
-  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
-  const u32 o0 = offsets.extract32<0>();
-  const u32 o1 = offsets.extract32<1>();
-  const u32 o2 = offsets.extract32<2>();
-  const u32 o3 = offsets.extract32<3>();
-
-  // TODO: split in two, merge, maybe could be zx loaded instead..
-  u16 p0, p1, p2, p3;
-  std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
-  std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
-  std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
-  std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
-  GSVector4i pixels = GSVector4i::zext32(p0);
-  pixels = pixels.insert16<2>(p1);
-  pixels = pixels.insert16<4>(p2);
-  pixels = pixels.insert16<6>(p3);
-
+#ifdef GSVECTOR_HAS_SRLV
+  // Everywhere except RISC-V, the shl 1 (* 2) can be folded into the load's addressing mode.
+  const GSVector4i offsets = indices.srlv32(shifts) & GSVector4i::cxpr(mask);
+  GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
   return pixels;
+#else
+  // Without variable shifts, it's probably quicker to do this in scalar code: the
+  // alternative is four separate vector shifts, plus broadcasting the shift amounts.
+  // Clang seems to turn this into a bunch of extracts, and skips memory. Nice.
+  alignas(VECTOR_ALIGNMENT) s32 indices_array[4], shifts_array[4];
+  GSVector4i::store<true>(indices_array, indices);
+  GSVector4i::store<true>(shifts_array, shifts);
+
+  GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[((indices_array[0] >> shifts_array[0]) & mask)]);
+  pixels = pixels.insert16<2>(g_gpu_clut[((indices_array[1] >> shifts_array[1]) & mask)]);
+  pixels = pixels.insert16<4>(g_gpu_clut[((indices_array[2] >> shifts_array[2]) & mask)]);
+  pixels = pixels.insert16<6>(g_gpu_clut[((indices_array[3] >> shifts_array[3]) & mask)]);
+  return pixels;
+#endif
 }
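+
+// Illustrative: the scalar computation each lane performs for a 4-bit palette.
+// One VRAM word packs four nibble indices, and texel x selects its nibble with
+// a shift of (x & 3) * 4 before masking:
+static_assert(((0x4321u >> ((2u & 3u) * 4u)) & 0x0Fu) == 0x3u,
+              "texel 2 of packed word 0x4321 reads palette index 3");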
 
 ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
@@ -322,42 +412,45 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
 
 ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
 {
+  const GSVector4i packed_color = color.pu32();
   if (x <= (VRAM_WIDTH - 4))
   {
-    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
   }
   else
   {
     u16* line = &g_vram[y * VRAM_WIDTH];
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
-    line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<2>());
+    line[x & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<3>());
   }
 }
 
-ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
+#endif
+
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVectorNi rgb5a1, GSVectorNi& rg, GSVectorNi& ba)
 {
-  rg = rgb5a1 & GSVector4i::cxpr(0x1F);                     // R | R | R | R
-  rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
-  ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);         // B | B | B | B
-  ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
+  rg = rgb5a1 & GSVectorNi::cxpr(0x1F);                     // R | R | R | R
+  rg = rg | (rgb5a1 & GSVectorNi::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
+  ba = rgb5a1.srl32<10>() & GSVectorNi::cxpr(0x1F);         // B | B | B | B
+  ba = ba | (rgb5a1 & GSVectorNi::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
 }
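+
+// Worked example of the split, assuming the usual RGB5A1 layout (R = bits 0-4,
+// G = 5-9, B = 10-14, A = 15): each 32-bit lane becomes two u16 components so
+// mul16l can scale R/G (and B/A) independently. For R = 31, G = 31, B = 0, A = 1:
+static_assert(((0x83FFu & 0x1Fu) | ((0x83FFu & 0x3E0u) << 11)) == 0x001F001Fu);
+static_assert((((0x83FFu >> 10) & 0x1Fu) | ((0x83FFu & 0x8000u) << 1)) == 0x00010000u);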
 
-ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
+ALWAYS_INLINE_RELEASE static GSVectorNi RG_BAToRGB5A1(GSVectorNi rg, GSVectorNi ba)
 {
-  GSVector4i res;
+  GSVectorNi res;
 
-  res = rg & GSVector4i::cxpr(0x1F);                       // R | R | R | R
-  res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0));  // RG | RG | RG | RG
-  res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
+  res = rg & GSVectorNi::cxpr(0x1F);                       // R | R | R | R
+  res = res | (rg.srl32<11>() & GSVectorNi::cxpr(0x3E0));  // RG | RG | RG | RG
+  res = res | ((ba & GSVectorNi::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
   res = res | ba.srl32<16>().sll32<15>();                  // RGBA | RGBA | RGBA | RGBA
 
   return res;
 }
 
 // Color repeated twice for RG packing, then duplicated so we can load based on the X offset.
-static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+alignas(VECTOR_ALIGNMENT) static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
 #define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
 #define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
 
@@ -367,70 +460,97 @@ static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
 #undef P
 };
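+
+// Each row is the four dither columns duplicated (with every value doubled up
+// for the RG pairs), so the unaligned 16-byte load at column offset (x & 3) * 2
+// -- at most index 6 -- always stays inside the 16-entry row:
+static_assert(((3u & 3u) * 2u) + 8u <= 16u, "worst-case dither row load fits");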
 
+namespace {
+template<bool texture_enable>
+struct PixelVectors
+{
+  struct UnusedField
+  {
+  };
+
+  GSVectorNi clip_left;
+  GSVectorNi clip_right;
+
+  GSVectorNi mask_and;
+  GSVectorNi mask_or;
+
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_y;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_y;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_y;
+
+  PixelVectors(const GPUBackendDrawCommand* cmd)
+  {
+    clip_left = GSVectorNi(g_drawing_area.left);
+    clip_right = GSVectorNi(g_drawing_area.right);
+
+    mask_and = GSVectorNi(cmd->params.GetMaskAND());
+    mask_or = GSVectorNi(cmd->params.GetMaskOR());
+
+    if constexpr (texture_enable)
+    {
+      texture_window_and_x = GSVectorNi(cmd->window.and_x);
+      texture_window_or_x = GSVectorNi(cmd->window.or_x);
+      texture_window_and_y = GSVectorNi(cmd->window.and_y);
+      texture_window_or_y = GSVectorNi(cmd->window.or_y);
+      texture_base_x = GSVectorNi(cmd->draw_mode.GetTexturePageBaseX());
+      texture_base_y = GSVectorNi(cmd->draw_mode.GetTexturePageBaseY());
+    }
+  }
+};
+} // namespace
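+
+// The conditional_t members above collapse to an empty tag when the matching
+// feature is compiled out, so untextured specializations don't carry the
+// texture-window vectors around (assumes <type_traits> is already pulled in):
+static_assert(std::is_empty_v<PixelVectors<false>::UnusedField>);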
+
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
 ALWAYS_INLINE_RELEASE static void
-ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
-           GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
+ShadePixel(const PixelVectors<texture_enable>& pv, GPUTextureMode texture_mode, GPUTransparencyMode transparency_mode,
+           u32 start_x, u32 y, GSVectorNi vertex_color_rg, GSVectorNi vertex_color_ba, GSVectorNi texcoord_x,
+           GSVectorNi texcoord_y, GSVectorNi preserve_mask, GSVectorNi dither)
 {
-  static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
-  static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+  static constexpr GSVectorNi coord_mask_x = GSVectorNi::cxpr(VRAM_WIDTH_MASK);
+  static constexpr GSVectorNi coord_mask_y = GSVectorNi::cxpr(VRAM_HEIGHT_MASK);
 
-  GSVector4i color;
+  GSVectorNi color;
 
   if constexpr (texture_enable)
   {
     // Apply texture window
-    texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
-    texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+    texcoord_x = (texcoord_x & pv.texture_window_and_x) | pv.texture_window_or_x;
+    texcoord_y = (texcoord_y & pv.texture_window_and_y) | pv.texture_window_or_y;
 
-    const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
-    const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+    texcoord_y = pv.texture_base_y.add32(texcoord_y) & coord_mask_y;
 
-    texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
-
-    GSVector4i texture_color;
-    switch (cmd->draw_mode.texture_mode)
+    GSVectorNi texture_color;
+    switch (texture_mode)
     {
       case GPUTextureMode::Palette4Bit:
       {
-        GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
-        load_texcoord_x = base_x.add32(load_texcoord_x);
+        GSVectorNi load_texcoord_x = texcoord_x.srl32<2>();
+        load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
         load_texcoord_x = load_texcoord_x & coord_mask_x;
 
-        // todo: sse4 path
-        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
-        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
-#ifdef GSVECTOR_HAS_SRLV
-        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
-#else
-        Assert(false && "Fixme");
-#endif
-
-        texture_color = GatherCLUTVector(palette_indices);
+        const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(3)).sll32<2>();
+        const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+        texture_color = GatherCLUTVector<0x0F>(palette_indices, palette_shift);
       }
       break;
 
       case GPUTextureMode::Palette8Bit:
       {
-        GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
-        load_texcoord_x = base_x.add32(load_texcoord_x);
+        GSVectorNi load_texcoord_x = texcoord_x.srl32<1>();
+        load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
         load_texcoord_x = load_texcoord_x & coord_mask_x;
 
-        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
-        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
-#ifdef GSVECTOR_HAS_SRLV
-        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
-#else
-        Assert(false && "Fixme");
-#endif
-
-        texture_color = GatherCLUTVector(palette_indices);
+        const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(1)).sll32<3>();
+        const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+        texture_color = GatherCLUTVector<0xFF>(palette_indices, palette_shift);
       }
       break;
 
       default:
       {
-        texcoord_x = base_x.add32(texcoord_x);
+        texcoord_x = pv.texture_base_x.add32(texcoord_x);
         texcoord_x = texcoord_x & coord_mask_x;
         texture_color = GatherVector(texcoord_x, texcoord_y);
       }
@@ -438,7 +558,7 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
     }
 
     // check for zero texture colour across the vector, early out if so
-    const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
+    const GSVectorNi texture_transparent_mask = texture_color.eq32(GSVectorNi::zero());
     if (texture_transparent_mask.alltrue())
       return;
 
@@ -450,19 +570,18 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
     }
     else
     {
-      GSVector4i trg, tba;
+      GSVectorNi trg, tba;
       RGB5A1ToRG_BA(texture_color, trg, tba);
 
       // now we have both the texture and vertex color in RG/BA pairs, which we can multiply
-      GSVector4i rg = trg.mul16l(vertex_color_rg);
-      GSVector4i ba = tba.mul16l(vertex_color_ba);
+      GSVectorNi rg = trg.mul16l(vertex_color_rg);
+      GSVectorNi ba = tba.mul16l(vertex_color_ba);
 
-      // TODO: Dither
       // Convert to 5bit.
       if constexpr (dithering_enable)
       {
-        rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
-        ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+        rg = rg.sra16<4>().add16(dither).max_i16(GSVectorNi::zero()).sra16<3>();
+        ba = ba.sra16<4>().add16(dither).max_i16(GSVectorNi::zero()).sra16<3>();
       }
       else
       {
@@ -474,7 +593,7 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
       ba = ba.blend16<0xaa>(tba);
 
       // Clamp to 5bit.
-      static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
+      static constexpr GSVectorNi colclamp = GSVectorNi::cxpr16(0x1F);
       rg = rg.min_u16(colclamp);
       ba = ba.min_u16(colclamp);
 
@@ -487,12 +606,12 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
     // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
     if constexpr (dithering_enable)
     {
-      GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
-      GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+      GSVectorNi rg = vertex_color_rg.add16(dither).max_i16(GSVectorNi::zero()).sra16<3>();
+      GSVectorNi ba = vertex_color_ba.add16(dither).max_i16(GSVectorNi::zero()).sra16<3>();
 
       // Clamp to 5bit. We use 32bit for BA to set A to zero.
-      rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
-      ba = ba.min_u16(GSVector4i::cxpr(0x1F));
+      rg = rg.min_u16(GSVectorNi::cxpr16(0x1F));
+      ba = ba.min_u16(GSVectorNi::cxpr(0x1F));
 
       // And interleave back to 16bpp.
       color = RG_BAToRGB5A1(rg, ba);
@@ -500,17 +619,17 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
     else
     {
       // Note that bit15 is set to 0 here, which the shift will do.
-      const GSVector4i rg = vertex_color_rg.srl16<3>();
-      const GSVector4i ba = vertex_color_ba.srl16<3>();
+      const GSVectorNi rg = vertex_color_rg.srl16<3>();
+      const GSVectorNi ba = vertex_color_ba.srl16<3>();
       color = RG_BAToRGB5A1(rg, ba);
     }
   }
 
-  GSVector4i bg_color = LoadVector(start_x, y);
+  GSVectorNi bg_color = LoadVector(start_x, y);
 
   if constexpr (transparency_enable)
   {
-    [[maybe_unused]] GSVector4i transparent_mask;
+    [[maybe_unused]] GSVectorNi transparent_mask;
     if constexpr (texture_enable)
     {
       // Compute transparent_mask, ffff per lane if transparent otherwise 0000
@@ -520,78 +639,72 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert
     // TODO: We don't need to OR color here with 0x8000 for textures.
     // 0x8000 is added to match serial path.
 
-    GSVector4i blended_color;
-    switch (cmd->draw_mode.transparency_mode)
+    GSVectorNi blended_color;
+    switch (transparency_mode)
     {
       case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
       {
-        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
-        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
-        const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
-        blended_color = res & GSVector4i::cxpr(0xffff);
+        const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
+        const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
+        const GSVectorNi res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x0421u)).srl32<1>();
+        blended_color = res & GSVectorNi::cxpr(0xffff);
       }
       break;
 
       case GPUTransparencyMode::BackgroundPlusForeground:
       {
-        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
-        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
-        const GSVector4i sum = fg_bits.add32(bg_bits);
-        const GSVector4i carry =
-          (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
-        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
-        blended_color = res & GSVector4i::cxpr(0xffff);
+        const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
+        const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
+        const GSVectorNi sum = fg_bits.add32(bg_bits);
+        const GSVectorNi carry =
+          (sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u))) & GSVectorNi::cxpr(0x8420u);
+        const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVectorNi::cxpr(0xffff);
       }
       break;
 
       case GPUTransparencyMode::BackgroundMinusForeground:
       {
-        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
-        const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
-        const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
-        const GSVector4i borrow =
-          diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
-        const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
-        blended_color = res & GSVector4i::cxpr(0xffff);
+        const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
+        const GSVectorNi fg_bits = color & GSVectorNi::cxpr(0x7FFFu);
+        const GSVectorNi diff = bg_bits.sub32(fg_bits).add32(GSVectorNi::cxpr(0x108420u));
+        const GSVectorNi borrow =
+          diff.sub32((bg_bits ^ fg_bits) & GSVectorNi::cxpr(0x108420u)) & GSVectorNi::cxpr(0x108420u);
+        const GSVectorNi res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
+        blended_color = res & GSVectorNi::cxpr(0xffff);
       }
       break;
 
       case GPUTransparencyMode::BackgroundPlusQuarterForeground:
       default:
       {
-        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
-        const GSVector4i fg_bits =
-          ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
-        const GSVector4i sum = fg_bits.add32(bg_bits);
-        const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
-        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
-        blended_color = res & GSVector4i::cxpr(0xffff);
+        const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
+        const GSVectorNi fg_bits =
+          ((color | GSVectorNi::cxpr(0x8000)).srl32<2>() & GSVectorNi::cxpr(0x1CE7u)) | GSVectorNi::cxpr(0x8000u);
+        const GSVectorNi sum = fg_bits.add32(bg_bits);
+        const GSVectorNi carry = sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u)) & GSVectorNi::cxpr(0x8420u);
+        const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVectorNi::cxpr(0xffff);
       }
       break;
     }
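+
+    // Worked example of the carry-free averaging trick used above (illustrative
+    // only): clearing the per-channel low bits of (fg ^ bg) before the shift
+    // keeps one 5-bit channel from bleeding into the next, so a single 32-bit
+    // average handles R, G and B at once.
+    static_assert((((0x801Fu + 0x8001u) - ((0x801Fu ^ 0x8001u) & 0x0421u)) >> 1) == 0x8010u,
+                  "avg(31, 1) == 16 in red, mask bit preserved");
+    static_assert((((0x8002u + 0x8001u) - ((0x8002u ^ 0x8001u) & 0x0421u)) >> 1) == 0x8001u,
+                  "avg(2, 1) truncates to 1, as on the scalar path");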
 
     // select blended pixels for transparent pixels, otherwise consider opaque
-    // TODO: SSE2
     if constexpr (texture_enable)
       color = color.blend8(blended_color, transparent_mask);
     else
-      color = blended_color & GSVector4i::cxpr(0x7fff);
+      color = blended_color & GSVectorNi::cxpr(0x7fff);
   }
 
-  // TODO: lift out to parent?
-  const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
-  const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
-
-  GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
-  mask_bits_set = mask_bits_set.sra16<15>();      // ffff if masked else 0000
-  preserve_mask = preserve_mask | mask_bits_set;  // ffff if preserved else 0000
+  GSVectorNi mask_bits_set = bg_color & pv.mask_and; // 8000 if masked else 0000
+  mask_bits_set = mask_bits_set.sra16<15>();         // ffff if masked else 0000
+  preserve_mask = preserve_mask | mask_bits_set;     // ffff if preserved else 0000
 
   bg_color = bg_color & preserve_mask;
-  color = (color | mask_or).andnot(preserve_mask);
+  color = (color | pv.mask_or).andnot(preserve_mask);
   color = color | bg_color;
 
-  const GSVector4i packed_color = color.pu32();
-  StoreVector(start_x, y, packed_color);
+  StoreVector(start_x, y, color);
 }
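+
+// The mask tail above relies on the arithmetic shift smearing bit 15 across
+// the whole 16-bit lane, turning "background pixel is masked" into a full
+// per-lane blend mask:
+static_assert(static_cast<u16>(static_cast<s16>(0x8000u) >> 15) == 0xFFFFu);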
 
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
@@ -600,17 +713,16 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
   const s32 origin_x = cmd->x;
   const s32 origin_y = cmd->y;
 
-  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
-  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
-  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
-  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
-  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0
+  const GSVector4i rgba = GSVector4i(cmd->color);               // RGBA | RGBA | RGBA | RGBA
+  const GSVector4i rgp = rgba.xxxxl();                          // RGRG | RGRG | RGRG | RGRG
+  const GSVector4i bap = rgba.yyyyl();                          // BABA | BABA | BABA | BABA
+  const GSVectorNi rg = GSVectorNi::broadcast128(rgp.u8to16()); // R0G0 | R0G0 | R0G0 | R0G0
+  const GSVectorNi ba = GSVectorNi::broadcast128(bap.u8to16()); // B0A0 | B0A0 | B0A0 | B0A0
 
-  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
-  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);
+  const GSVectorNi texcoord_x = GSVectorNi(cmd->texcoord & 0xFF).add32(SPAN_OFFSET_VEC);
+  GSVectorNi texcoord_y = GSVectorNi(cmd->texcoord >> 8);
 
-  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
-  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
+  const PixelVectors<texture_enable> pv(cmd);
   const u32 width = cmd->width;
 
   BACKUP_VRAM();
@@ -618,41 +730,42 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
   for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
   {
     const s32 y = origin_y + static_cast<s32>(offset_y);
-    if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
-        (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+    if (y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom) &&
+        (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)))
     {
-      continue;
-    }
+      const s32 draw_y = (y & VRAM_HEIGHT_MASK);
 
-    GSVector4i row_texcoord_x = texcoord_x;
-    GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
-    GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+      GSVectorNi row_texcoord_x = texcoord_x;
+      GSVectorNi xvec = GSVectorNi(origin_x).add32(SPAN_OFFSET_VEC);
+      GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
 
-    for (u32 offset_x = 0; offset_x < width; offset_x += 4)
-    {
-      const s32 x = origin_x + static_cast<s32>(offset_x);
-
-      // width test
-      GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());
-
-      // clip test, if all pixels are outside, skip
-      preserve_mask = preserve_mask | xvec.lt32(clip_left);
-      preserve_mask = preserve_mask | xvec.gt32(clip_right);
-      if (!preserve_mask.alltrue())
+      for (u32 offset_x = 0; offset_x < width; offset_x += PIXELS_PER_VEC)
       {
-        ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
-          cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
+        const s32 x = origin_x + static_cast<s32>(offset_x);
+
+        // width test
+        GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
+
+        // clip test, if all pixels are outside, skip
+        preserve_mask = preserve_mask | xvec.lt32(pv.clip_left);
+        preserve_mask = preserve_mask | xvec.gt32(pv.clip_right);
+        if (!preserve_mask.alltrue())
+        {
+          ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
+            pv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, x, draw_y, rg, ba, row_texcoord_x,
+            texcoord_y, preserve_mask, GSVectorNi::zero());
+        }
+
+        xvec = xvec.add32(PIXELS_PER_VEC_VEC);
+        wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
+
+        if constexpr (texture_enable)
+          row_texcoord_x = row_texcoord_x.add32(PIXELS_PER_VEC_VEC) & GSVectorNi::cxpr(0xFF);
       }
-
-      xvec = xvec.add32(GSVector4i::cxpr(4));
-      wvec = wvec.sub32(GSVector4i::cxpr(4));
-
-      if constexpr (texture_enable)
-        row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
     }
 
     if constexpr (texture_enable)
-      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
+      texcoord_y = texcoord_y.add32(GSVectorNi::cxpr(1)) & GSVectorNi::cxpr(0xFF);
   }
 
   CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
@@ -761,7 +874,7 @@ struct UVStepper
   ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
   ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
 
-  ALWAYS_INLINE void SetStart(u32 ustart, u32 vstart)
+  ALWAYS_INLINE void Init(u32 ustart, u32 vstart)
   {
     u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
     v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
@@ -772,10 +885,24 @@ struct UVStepper
     u = u + steps.dudx;
     v = v + steps.dvdx;
   }
-  ALWAYS_INLINE void StepXY(const UVSteps& steps, s32 x_count, s32 y_count)
+
+  ALWAYS_INLINE void StepX(const UVSteps& steps, s32 count)
   {
-    u = u + (steps.dudx * static_cast<u32>(x_count)) + (steps.dudy * static_cast<u32>(y_count));
-    v = v + (steps.dvdx * static_cast<u32>(x_count)) + (steps.dvdy * static_cast<u32>(y_count));
+    u = u + static_cast<u32>(static_cast<s32>(steps.dudx) * count);
+    v = v + static_cast<u32>(static_cast<s32>(steps.dvdx) * count);
+  }
+
+  template<bool upside_down>
+  ALWAYS_INLINE void StepY(const UVSteps& steps)
+  {
+    u = upside_down ? (u - steps.dudy) : (u + steps.dudy);
+    v = upside_down ? (v - steps.dvdy) : (v + steps.dvdy);
+  }
+
+  ALWAYS_INLINE void StepY(const UVSteps& steps, s32 count)
+  {
+    u = u + static_cast<u32>(static_cast<s32>(steps.dudy) * count);
+    v = v + static_cast<u32>(static_cast<s32>(steps.dvdy) * count);
   }
 };
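+
+// Init() seeds each accumulator at the texel centre: in fixed point that is
+// (start + 0.5) << shift, so the truncation in GetU()/GetV() rounds to the
+// nearest texel. With a hypothetical 4-bit fraction for illustration:
+static_assert((((3u << 4) + (1u << 3)) >> 4) == 3u);             // 3.5 truncates to 3
+static_assert((((3u << 4) + (1u << 3) + (1u << 3)) >> 4) == 4u); // half a step later, 4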
 
@@ -800,7 +927,7 @@ struct RGBStepper
   ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
   ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
 
-  ALWAYS_INLINE void SetStart(u32 rstart, u32 gstart, u32 bstart)
+  ALWAYS_INLINE void Init(u32 rstart, u32 gstart, u32 bstart)
   {
     r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
     g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
@@ -813,11 +940,27 @@ struct RGBStepper
     g = g + steps.dgdx;
     b = b + steps.dbdx;
   }
-  ALWAYS_INLINE void StepXY(const RGBSteps& steps, s32 x_count, s32 y_count)
+
+  ALWAYS_INLINE void StepX(const RGBSteps& steps, s32 count)
   {
-    r = r + (steps.drdx * static_cast<u32>(x_count)) + (steps.drdy * static_cast<u32>(y_count));
-    g = g + (steps.dgdx * static_cast<u32>(x_count)) + (steps.dgdy * static_cast<u32>(y_count));
-    b = b + (steps.dbdx * static_cast<u32>(x_count)) + (steps.dbdy * static_cast<u32>(y_count));
+    r = r + static_cast<u32>(static_cast<s32>(steps.drdx) * count);
+    g = g + static_cast<u32>(static_cast<s32>(steps.dgdx) * count);
+    b = b + static_cast<u32>(static_cast<s32>(steps.dbdx) * count);
+  }
+
+  template<bool upside_down>
+  ALWAYS_INLINE void StepY(const RGBSteps& steps)
+  {
+    r = upside_down ? (r - steps.drdy) : (r + steps.drdy);
+    g = upside_down ? (g - steps.dgdy) : (g + steps.dgdy);
+    b = upside_down ? (b - steps.dbdy) : (b + steps.dbdy);
+  }
+
+  ALWAYS_INLINE void StepY(const RGBSteps& steps, s32 count)
+  {
+    r = r + static_cast<u32>(static_cast<s32>(steps.drdy) * count);
+    g = g + static_cast<u32>(static_cast<s32>(steps.dgdy) * count);
+    b = b + static_cast<u32>(static_cast<s32>(steps.dbdy) * count);
   }
 };
 
@@ -841,9 +984,6 @@ template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool
 static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
                      const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
 {
-  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
-    return;
-
   s32 width = x_bound - x_start;
   s32 current_x = TruncateGPUVertexPosition(x_start);
 
@@ -863,9 +1003,9 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start
     return;
 
   if constexpr (texture_enable)
-    uv.StepXY(uvstep, x_start, y);
+    uv.StepX(uvstep, x_start);
   if constexpr (shading_enable)
-    rgb.StepXY(rgbstep, x_start, y);
+    rgb.StepX(rgbstep, x_start);
 
   do
   {
@@ -880,137 +1020,6 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start
   } while (--width > 0);
 }
 
-#else // USE_VECTOR
-
-template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
-         bool dithering_enable>
-static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
-                     const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
-{
-  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
-    return;
-
-  s32 w = x_bound - x_start;
-  s32 x = TruncateGPUVertexPosition(x_start);
-
-  if (x < static_cast<s32>(g_drawing_area.left))
-  {
-    const s32 delta = static_cast<s32>(g_drawing_area.left) - x;
-    x_start += delta;
-    x += delta;
-    w -= delta;
-  }
-
-  if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
-    w = static_cast<s32>(g_drawing_area.right) + 1 - x;
-
-  if (w <= 0)
-    return;
-
-  // TODO: Precompute.
-
-  const auto clip_left = GSVector4i(g_drawing_area.left);
-  const auto clip_right = GSVector4i(g_drawing_area.right);
-
-  const GSVector4i dr_dx = GSVector4i(rgbstep.drdx * 4);
-  const GSVector4i dg_dx = GSVector4i(rgbstep.dgdx * 4);
-  const GSVector4i db_dx = GSVector4i(rgbstep.dbdx * 4);
-  const GSVector4i du_dx = GSVector4i(uvstep.dudx * 4);
-  const GSVector4i dv_dx = GSVector4i(uvstep.dvdx * 4);
-
-  // TODO: vectorize
-  const GSVector4i dr_dx_offset = GSVector4i(0, rgbstep.drdx, rgbstep.drdx * 2, rgbstep.drdx * 3);
-  const GSVector4i dg_dx_offset = GSVector4i(0, rgbstep.dgdx, rgbstep.dgdx * 2, rgbstep.dgdx * 3);
-  const GSVector4i db_dx_offset = GSVector4i(0, rgbstep.dbdx, rgbstep.dbdx * 2, rgbstep.dbdx * 3);
-  const GSVector4i du_dx_offset = GSVector4i(0, uvstep.dudx, uvstep.dudx * 2, uvstep.dudx * 3);
-  const GSVector4i dv_dx_offset = GSVector4i(0, uvstep.dvdx, uvstep.dvdx * 2, uvstep.dvdx * 3);
-
-  GSVector4i dr, dg, db;
-  if constexpr (shading_enable)
-  {
-    dr = GSVector4i(rgb.r + rgbstep.drdx * x_start).add32(dr_dx_offset);
-    dg = GSVector4i(rgb.g + rgbstep.dgdx * x_start).add32(dg_dx_offset);
-    db = GSVector4i(rgb.b + rgbstep.dbdx * x_start).add32(db_dx_offset);
-  }
-  else
-  {
-    // precompute for flat shading
-    dr = GSVector4i(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
-    dg = GSVector4i((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
-    db = GSVector4i(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
-  }
-
-  GSVector4i du = GSVector4i(uv.u + uvstep.dudx * x_start).add32(du_dx_offset);
-  GSVector4i dv = GSVector4i(uv.v + uvstep.dvdx * x_start).add32(dv_dx_offset);
-
-  // TODO: Move to caller.
-  if constexpr (shading_enable)
-  {
-    // TODO: vectorize multiply?
-    dr = dr.add32(GSVector4i(rgbstep.drdy * y));
-    dg = dg.add32(GSVector4i(rgbstep.dgdy * y));
-    db = db.add32(GSVector4i(rgbstep.dbdy * y));
-  }
-
-  if constexpr (texture_enable)
-  {
-    du = du.add32(GSVector4i(uvstep.dudy * y));
-    dv = dv.add32(GSVector4i(uvstep.dvdy * y));
-  }
-
-  const GSVector4i dither =
-    GSVector4i::load<false>(&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(x) & 3) * 2]);
-
-  GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
-  GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));
-
-  for (s32 count = (w + 3) / 4; count > 0; --count)
-  {
-    // R000 | R000 | R000 | R000
-    // R0G0 | R0G0 | R0G0 | R0G0
-    const GSVector4i r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
-    const GSVector4i g =
-      shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
-    const GSVector4i b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
-    const GSVector4i u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
-    const GSVector4i v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
-
-    const GSVector4i rg = r.blend16<0xAA>(g);
-
-    // mask based on what's outside the span
-    auto preserve_mask = wvec.lt32(GSVector4i::zero());
-
-    // clip test, if all pixels are outside, skip
-    preserve_mask = preserve_mask | xvec.lt32(clip_left);
-    preserve_mask = preserve_mask | xvec.gt32(clip_right);
-    if (!preserve_mask.alltrue())
-    {
-      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-        cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
-    }
-
-    x += 4;
-
-    xvec = xvec.add32(GSVector4i::cxpr(4));
-    wvec = wvec.sub32(GSVector4i::cxpr(4));
-
-    if constexpr (shading_enable)
-    {
-      dr = dr.add32(dr_dx);
-      dg = dg.add32(dg_dx);
-      db = db.add32(db_dx);
-    }
-
-    if constexpr (texture_enable)
-    {
-      du = du.add32(du_dx);
-      dv = dv.add32(dv_dx);
-    }
-  }
-}
-
-#endif // USE_VECTOR
-
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
@@ -1028,7 +1037,18 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo
 
   if (tp.fill_upside_down)
   {
-    while (current_y > end_y)
+    if (current_y <= end_y)
+      return;
+
+    UVStepper luv = uv;
+    if constexpr (texture_enable)
+      luv.StepY(uvstep, current_y);
+
+    RGBStepper lrgb = rgb;
+    if constexpr (shading_enable)
+      lrgb.StepY(rgbstep, current_y);
+
+    do
     {
       current_y--;
       left_x -= left_x_step;
@@ -1037,16 +1057,37 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo
       const s32 y = TruncateGPUVertexPosition(current_y);
       if (y < static_cast<s32>(g_drawing_area.top))
         break;
-      else if (y > static_cast<s32>(g_drawing_area.bottom))
+
+      // Opposite direction means we need to subtract when stepping instead of adding.
+      if constexpr (texture_enable)
+        luv.StepY<true>(uvstep);
+      if constexpr (shading_enable)
+        lrgb.StepY<true>(rgbstep);
+
+      if (y > static_cast<s32>(g_drawing_area.bottom) ||
+          (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
+      {
         continue;
+      }
 
       DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-        cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep);
-    }
+        cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
+    } while (current_y > end_y);
   }
   else
   {
-    while (current_y < end_y)
+    if (current_y >= end_y)
+      return;
+
+    UVStepper luv = uv;
+    if constexpr (texture_enable)
+      luv.StepY(uvstep, current_y);
+
+    RGBStepper lrgb = rgb;
+    if constexpr (shading_enable)
+      lrgb.StepY(rgbstep, current_y);
+
+    do
     {
       const s32 y = TruncateGPUVertexPosition(current_y);
 
@@ -1054,19 +1095,273 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo
       {
         break;
       }
-      else if (y >= static_cast<s32>(g_drawing_area.top))
+      if (y >= static_cast<s32>(g_drawing_area.top) &&
+          (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
       {
         DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-          cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep);
+          cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
       }
 
       current_y++;
       left_x += left_x_step;
       right_x += right_x_step;
+
+      if constexpr (texture_enable)
+        luv.StepY<false>(uvstep);
+      if constexpr (shading_enable)
+        lrgb.StepY<false>(rgbstep);
+    } while (current_y < end_y);
+  }
+}
+
+#else // USE_VECTOR
+
+namespace {
+template<bool shading_enable, bool texture_enable>
+struct TriangleVectors : PixelVectors<texture_enable>
+{
+  using UnusedField = typename PixelVectors<texture_enable>::UnusedField;
+
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx_0123;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx_0123;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx_0123;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx_0123;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx_0123;
+
+  TriangleVectors(const GPUBackendDrawCommand* cmd, const UVSteps& uvstep, const RGBSteps& rgbstep)
+    : PixelVectors<texture_enable>(cmd)
+  {
+    if constexpr (shading_enable)
+    {
+      drdx = GSVectorNi(rgbstep.drdx * PIXELS_PER_VEC);
+      dgdx = GSVectorNi(rgbstep.dgdx * PIXELS_PER_VEC);
+      dbdx = GSVectorNi(rgbstep.dbdx * PIXELS_PER_VEC);
+
+      drdx_0123 = GSVectorNi(rgbstep.drdx).mul32l(SPAN_OFFSET_VEC);
+      dgdx_0123 = GSVectorNi(rgbstep.dgdx).mul32l(SPAN_OFFSET_VEC);
+      dbdx_0123 = GSVectorNi(rgbstep.dbdx).mul32l(SPAN_OFFSET_VEC);
+    }
+
+    if constexpr (texture_enable)
+    {
+      dudx = GSVectorNi(uvstep.dudx * PIXELS_PER_VEC);
+      dvdx = GSVectorNi(uvstep.dvdx * PIXELS_PER_VEC);
+      dudx_0123 = GSVectorNi(uvstep.dudx).mul32l(SPAN_OFFSET_VEC);
+      dvdx_0123 = GSVectorNi(uvstep.dvdx).mul32l(SPAN_OFFSET_VEC);
+    }
+  }
+};
+} // namespace
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+                     const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep,
+                     const TriangleVectors<shading_enable, texture_enable>& tv)
+{
+  s32 width = x_bound - x_start;
+  s32 current_x = TruncateGPUVertexPosition(x_start);
+
+  // Skip pixels outside of the scissor rectangle.
+  if (current_x < static_cast<s32>(g_drawing_area.left))
+  {
+    const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
+    x_start += delta;
+    current_x += delta;
+    width -= delta;
+  }
+
+  if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
+    width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
+
+  if (width <= 0)
+    return;
+
+  GSVectorNi dr, dg, db;
+  if constexpr (shading_enable)
+  {
+    dr = GSVectorNi(rgb.r + rgbstep.drdx * x_start).add32(tv.drdx_0123);
+    dg = GSVectorNi(rgb.g + rgbstep.dgdx * x_start).add32(tv.dgdx_0123);
+    db = GSVectorNi(rgb.b + rgbstep.dbdx * x_start).add32(tv.dbdx_0123);
+  }
+  else
+  {
+    // precompute for flat shading
+    dr = GSVectorNi(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+    dg = GSVectorNi((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
+    db = GSVectorNi(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+  }
+
+  GSVectorNi du, dv;
+  if constexpr (texture_enable)
+  {
+    du = GSVectorNi(uv.u + uvstep.dudx * x_start).add32(tv.dudx_0123);
+    dv = GSVectorNi(uv.v + uvstep.dvdx * x_start).add32(tv.dvdx_0123);
+  }
+  else
+  {
+    // Hopefully optimized out...
+    du = GSVectorNi::zero();
+    dv = GSVectorNi::zero();
+  }
+
+  const GSVectorNi dither = GSVectorNi::broadcast128<false>(
+    &VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(current_x) & 3) * 2]);
+
+  GSVectorNi xvec = GSVectorNi(current_x).add32(SPAN_OFFSET_VEC);
+  GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
+
+  for (s32 count = (width + (PIXELS_PER_VEC - 1)) / PIXELS_PER_VEC; count > 0; --count)
+  {
+    // R000 | R000 | R000 | R000
+    // R0G0 | R0G0 | R0G0 | R0G0
+    const GSVectorNi r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
+    const GSVectorNi g =
+      shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
+    const GSVectorNi b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
+    const GSVectorNi u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+    const GSVectorNi v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+
+    const GSVectorNi rg = r.blend16<0xAA>(g);
+
+    // mask based on what's outside the span
+    GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
+
+    // clip test, if all pixels are outside, skip
+    preserve_mask = preserve_mask | xvec.lt32(tv.clip_left);
+    preserve_mask = preserve_mask | xvec.gt32(tv.clip_right);
+    if (!preserve_mask.alltrue())
+    {
+      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+        tv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, static_cast<u32>(current_x),
+        static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
+    }
+
+    current_x += PIXELS_PER_VEC;
+
+    xvec = xvec.add32(PIXELS_PER_VEC_VEC);
+    wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
+
+    if constexpr (shading_enable)
+    {
+      dr = dr.add32(tv.drdx);
+      dg = dg.add32(tv.dgdx);
+      db = db.add32(tv.dbdx);
+    }
+
+    if constexpr (texture_enable)
+    {
+      du = du.add32(tv.dudx);
+      dv = dv.add32(tv.dvdx);
     }
   }
 }
 
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
+                                                   const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
+                                                   const RGBSteps& rgbstep)
+{
+  static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
+
+  const u64 left_x_step = tp.step_x[0];
+  const u64 right_x_step = tp.step_x[1];
+  const s32 end_y = tp.end_y;
+  u64 left_x = tp.start_x[0];
+  u64 right_x = tp.start_x[1];
+  s32 current_y = tp.start_y;
+
+  if (tp.fill_upside_down)
+  {
+    if (current_y <= end_y)
+      return;
+
+    UVStepper luv = uv;
+    if constexpr (texture_enable)
+      luv.StepY(uvstep, current_y);
+
+    RGBStepper lrgb = rgb;
+    if constexpr (shading_enable)
+      lrgb.StepY(rgbstep, current_y);
+
+    const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
+
+    do
+    {
+      current_y--;
+      left_x -= left_x_step;
+      right_x -= right_x_step;
+
+      const s32 y = TruncateGPUVertexPosition(current_y);
+      if (y < static_cast<s32>(g_drawing_area.top))
+        break;
+
+      // Opposite direction means we need to subtract when stepping instead of adding.
+      if constexpr (texture_enable)
+        luv.StepY<true>(uvstep);
+      if constexpr (shading_enable)
+        lrgb.StepY<true>(rgbstep);
+
+      if (y > static_cast<s32>(g_drawing_area.bottom) ||
+          (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
+      {
+        continue;
+      }
+
+      DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+        cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
+    } while (current_y > end_y);
+  }
+  else
+  {
+    if (current_y >= end_y)
+      return;
+
+    UVStepper luv = uv;
+    if constexpr (texture_enable)
+      luv.StepY(uvstep, current_y);
+
+    RGBStepper lrgb = rgb;
+    if constexpr (shading_enable)
+      lrgb.StepY(rgbstep, current_y);
+
+    const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
+
+    do
+    {
+      const s32 y = TruncateGPUVertexPosition(current_y);
+
+      if (y > static_cast<s32>(g_drawing_area.bottom))
+      {
+        break;
+      }
+      if (y >= static_cast<s32>(g_drawing_area.top) &&
+          (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
+      {
+        DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+          cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
+      }
+
+      current_y++;
+      left_x += left_x_step;
+      right_x += right_x_step;
+
+      if constexpr (texture_enable)
+        luv.StepY<false>(uvstep);
+      if constexpr (shading_enable)
+        lrgb.StepY<false>(rgbstep);
+    } while (current_y < end_y);
+  }
+}
+
+#endif // USE_VECTOR
+
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
@@ -1079,31 +1374,31 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
 #endif
 
   // Sort vertices so that v0 is the top vertex, v2 is the bottom vertex, and v1 is the middle (side) vertex.
-  u32 vc = 0;
+  u32 tl = 0;
   if (v1->x <= v0->x)
-    vc = (v2->x <= v1->x) ? 4 : 2;
+    tl = (v2->x <= v1->x) ? 4 : 2;
   else if (v2->x < v0->x)
-    vc = 4;
+    tl = 4;
   else
-    vc = 1;
+    tl = 1;
   if (v2->y < v1->y)
   {
     std::swap(v2, v1);
-    vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1);
+    tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
   }
   if (v1->y < v0->y)
   {
     std::swap(v1, v0);
-    vc = ((vc >> 1) & 0x1) | ((vc << 1) & 0x2) | (vc & 0x4);
+    tl = ((tl >> 1) & 0x1) | ((tl << 1) & 0x2) | (tl & 0x4);
   }
   if (v2->y < v1->y)
   {
     std::swap(v2, v1);
-    vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1);
+    tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
   }
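+
+  // tl is a one-hot tag (bit n set => slot n holds the leftmost, i.e.
+  // top-left, vertex); each std::swap above swaps the matching pair of bits
+  // so the tag keeps following its vertex. E.g. swapping v2 and v1 moves
+  // bit 2 to bit 1:
+  static_assert((((4u >> 1) & 0x2u) | ((4u << 1) & 0x4u) | (4u & 0x1u)) == 2u);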
 
   const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
-  vc = vc >> 1;
+  tl = tl >> 1;
 
   // Invalid size early culling.
   if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
@@ -1123,8 +1418,8 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
   const s64 base_step = makestep_xy(v2->x - v0->x, v2->y - v0->y);
   const s64 bound_coord_us = (v1->y == v0->y) ? 0 : makestep_xy(v1->x - v0->x, v1->y - v0->y);
   const s64 bound_coord_ls = (v2->y == v1->y) ? 0 : makestep_xy(v2->x - v1->x, v2->y - v1->y);
-  const u32 vo = (vc != 0) ? 1 : 0;
-  const u32 vp = (vc == 2) ? 3 : 0;
+  const u32 vo = (tl != 0) ? 1 : 0;
+  const u32 vp = (tl == 2) ? 3 : 0;
   const bool right_facing = (v1->y == v0->y) ? (v1->x > v0->x) : (bound_coord_us > base_step);
   const u32 rfi = BoolToUInt32(right_facing);
   const u32 ofi = BoolToUInt32(!right_facing);
@@ -1182,21 +1477,28 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
   // Step back by the start vertex position, so that adding the per-line offsets begins at the vertex's starting value.
   UVStepper uv;
   RGBStepper rgb;
-  const GPUBackendDrawPolygonCommand::Vertex* core_vertex = vertices[vc];
+  const GPUBackendDrawPolygonCommand::Vertex* top_left_vertex = vertices[tl];
   if constexpr (texture_enable)
   {
-    uv.SetStart(core_vertex->u, core_vertex->v);
-    uv.StepXY(uvstep, -core_vertex->x, -core_vertex->y);
+    uv.Init(top_left_vertex->u, top_left_vertex->v);
+    uv.StepX(uvstep, -top_left_vertex->x);
+    uv.StepY(uvstep, -top_left_vertex->y);
   }
   else
   {
-    // Not actually used, but shut up the compiler. Should get optimized out.
     uv = {};
   }
 
-  rgb.SetStart(core_vertex->r, core_vertex->g, core_vertex->b);
   if constexpr (shading_enable)
-    rgb.StepXY(rgbstep, -core_vertex->x, -core_vertex->y);
+  {
+    rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
+    rgb.StepX(rgbstep, -top_left_vertex->x);
+    rgb.StepY(rgbstep, -top_left_vertex->y);
+  }
+  else
+  {
+    rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
+  }
 
 #ifdef USE_VECTOR
   BACKUP_VRAM();