From 724b1a7cc4d9e4c0c1bb01bc9e13475a89bbf4da Mon Sep 17 00:00:00 2001
From: Connor McLaughlin
Date: Thu, 31 Mar 2022 02:11:02 +1000
Subject: [PATCH] GPU/SW: SIMD-ify the software renderer

---
 src/core/core.vcxproj               |   6 +
 src/core/core.vcxproj.filters       |   1 +
 src/core/gpu_sw_rasterizer.cpp      |  45 ++
 src/core/gpu_sw_rasterizer.h        |  27 ++
 src/core/gpu_sw_rasterizer.inl      | 615 +++++++++++++++++++++++++++-
 src/core/gpu_sw_rasterizer_avx2.cpp |  12 +
 6 files changed, 699 insertions(+), 7 deletions(-)
 create mode 100644 src/core/gpu_sw_rasterizer_avx2.cpp

diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 29574d56a..b23b1cbf3 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -51,6 +51,12 @@
+
+      AdvancedVectorExtensions2
+      %(AdditionalOptions) -mavx2
+      true
+      NotUsing
+
 
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index 2a3bcaef8..d0dd29200 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -69,6 +69,7 @@
+
 
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 59d5ade53..47177c0da 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -7,6 +7,8 @@
 #include "cpuinfo.h"
 
 #include "common/log.h"
+#include "common/string_util.h"
+
 Log_SetChannel(GPU_SW_Rasterizer);
 
 namespace GPU_SW_Rasterizer {
@@ -39,6 +41,15 @@ namespace GPU_SW_Rasterizer {
 #include "gpu_sw_rasterizer.inl"
 }
 
+// Default vector implementation definitions.
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+namespace GPU_SW_Rasterizer::Vector {
+#define USE_VECTOR 1
+#include "gpu_sw_rasterizer.inl"
+#undef USE_VECTOR
+}
+#endif
+
 // Initialize with default implementation.
 namespace GPU_SW_Rasterizer {
 const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
@@ -49,4 +58,40 @@ const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
 
 // Declare alternative implementations.
 void GPU_SW_Rasterizer::SelectImplementation()
 {
+  static bool selected = false;
+  if (selected)
+    return;
+
+  selected = true;
+
+#define SELECT_ALTERNATIVE_RASTERIZER(isa) \
+  do \
+  { \
+    INFO_LOG("* Using " #isa " software rasterizer implementation."); \
+    SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \
+    SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \
+    SelectedDrawLineFunctions = &isa::DrawLineFunctions; \
+  } while (0)
+
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+  const char* use_isa = std::getenv("SW_USE_ISA");
+
+#ifdef CPU_ARCH_SSE
+  if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
+  {
+    SELECT_ALTERNATIVE_RASTERIZER(AVX2);
+    return;
+  }
+#endif
+
+  if (!use_isa || StringUtil::Strcasecmp(use_isa, "Vector") == 0)
+  {
+    SELECT_ALTERNATIVE_RASTERIZER(Vector);
+    return;
+  }
+#endif
+
+  INFO_LOG("* Using scalar software rasterizer implementation.");
+
+#undef SELECT_ALTERNATIVE_RASTERIZER
 }
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
index f3d95f0bf..f183e26d3 100644
--- a/src/core/gpu_sw_rasterizer.h
+++ b/src/core/gpu_sw_rasterizer.h
@@ -6,6 +6,7 @@
 #include "gpu.h"
 #include "gpu_types.h"
+#include "common/intrin.h"
 #include "common/types.h"
 #include
@@ -33,6 +34,11 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
                                   const GPUBackendDrawLineCommand::Vertex* p1);
 typedef const DrawLineFunction DrawLineFunctionTable[2][2][2];
 
+// Default implementation, compatible with all ISAs.
+extern const DrawRectangleFunctionTable DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable DrawTriangleFunctions;
+extern const DrawLineFunctionTable DrawLineFunctions;
+
 // Current implementation, selected at runtime.
 extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
 extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
@@ -60,4 +66,24 @@ ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_e
     [u8(transparency_enable)][u8(dithering_enable)];
 }
 
+#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
+  namespace isa { \
+  extern const DrawRectangleFunctionTable DrawRectangleFunctions; \
+  extern const DrawTriangleFunctionTable DrawTriangleFunctions; \
+  extern const DrawLineFunctionTable DrawLineFunctions; \
+  }
+
+// Have to declare the symbols globally, because clang won't include them otherwise.
+#if defined(CPU_ARCH_SSE)
+#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2)
+#else
+#define ALTERNATIVE_RASTERIZER_LIST()
+#endif
+
+ALTERNATIVE_RASTERIZER_LIST()
+
+#undef DECLARE_ALTERNATIVE_RASTERIZER
+
 } // namespace GPU_SW_Rasterizer
+
+// static u32 s_bad_counter = 0;
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 15871b95b..acf6c5cfe 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -3,15 +3,55 @@
 
 #ifdef __INTELLISENSE__
 
+#include "common/gsvector.h"
 #include "gpu.h"
 #include
 
+#define USE_VECTOR 1
+#define GSVECTOR_HAS_SRLV 1
+
 extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
 
 namespace GPU_SW_Rasterizer {
 #endif
 
+// TODO: UpdateVRAM, FillVRAM, etc.
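Note on the mechanics before the rasterizer internals: the same gpu_sw_rasterizer.inl is compiled once per ISA, each time wrapped in a different namespace, and SelectImplementation() simply repoints the Selected* table pointers. Below is a compilable miniature of that pattern; every name in it is an illustrative stand-in, not DuckStation's (the real tables are multi-dimensional and declared in gpu_sw_rasterizer.h).

// Toy version of the per-ISA table dispatch. The real tables are indexed
// by [texture][raw_texture][transparency] etc.; a 2-entry table stands in.
#include <cstdio>

using DrawFunction = void (*)(int);
typedef const DrawFunction DrawFunctionTable[2];

namespace Scalar {
static void Draw(int x) { std::printf("scalar %d\n", x); }
constexpr DrawFunctionTable DrawFunctions = {&Draw, &Draw};
} // namespace Scalar

namespace AVX2 {
static void Draw(int x) { std::printf("avx2 %d\n", x); }
constexpr DrawFunctionTable DrawFunctions = {&Draw, &Draw};
} // namespace AVX2

// Starts out pointing at the baseline table, like SelectedDraw*Functions.
static DrawFunctionTable* SelectedDrawFunctions = &Scalar::DrawFunctions;

static void SelectImplementation(bool has_avx2)
{
  // The patch keys this off cpuinfo_has_x86_avx2() and the SW_USE_ISA
  // environment variable; a plain bool stands in for both here.
  if (has_avx2)
    SelectedDrawFunctions = &AVX2::DrawFunctions;
}

int main()
{
  SelectImplementation(/*has_avx2=*/true);
  (*SelectedDrawFunctions)[1](42); // dispatches to AVX2::Draw
}

Dispatching through a table pointer keeps the per-draw indexing identical for every backend; selection costs one pointer assignment at startup.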
+
+#ifdef USE_VECTOR
+#if 0
+static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
+static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+#define BACKUP_VRAM() \
+  do \
+  { \
+    std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
+    s_bad_counter++; \
+  } while (0)
+#define CHECK_VRAM(drawer) \
+  do \
+  { \
+    std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
+    std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
+ \
+    drawer; \
+    for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
+    { \
+      if (s_new_vram[vidx] != g_vram[vidx]) \
+      { \
+        fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
+                (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
+        AssertMsg(false, "Mismatch"); \
+      } \
+    } \
+    /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
+  } while (0)
+#else
+#define BACKUP_VRAM()
+#define CHECK_VRAM(drawer)
+#endif
+#endif
+
 namespace {
 enum {
@@ -49,32 +89,33 @@ struct line_fxp_step
 };
 } // namespace
 
-ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
 {
   return g_vram[VRAM_WIDTH * y + x];
 }
-ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
 {
   return &g_vram[VRAM_WIDTH * y + x];
 }
-ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
 {
   g_vram[VRAM_WIDTH * y + x] = value;
 }
 
-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
 {
   return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
 }
 
-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
 {
   return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
 }
 
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g,
-                                             u8 color_b, u8 texcoord_x, u8 texcoord_y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
+                                                              u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+                                                              u8 texcoord_y)
 {
   u16 color;
   if constexpr (texture_enable)
@@ -215,6 +256,8 @@ ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u
   SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
 }
 
+#ifndef USE_VECTOR
+
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
 static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
 {
@@ -249,6 +292,413 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
   }
 }
 
+#else // USE_VECTOR
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
+{
+  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
+  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+
+  const u32 o0 = offsets.extract32<0>();
+  const u32 o1 = offsets.extract32<1>();
+  const u32 o2 = offsets.extract32<2>();
+  const u32 o3 = offsets.extract32<3>();
+
+  // TODO: split in two, merge, maybe could be zx loaded instead..
+  u16 p0, p1, p2, p3;
+  std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
+  std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
+  std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
+  std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
+  GSVector4i pixels = GSVector4i::load(p0);
+  pixels = pixels.insert16<2>(p1);
+  pixels = pixels.insert16<4>(p2);
+  pixels = pixels.insert16<6>(p3);
+
+  return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
+{
+  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
+  const u32 o0 = offsets.extract32<0>();
+  const u32 o1 = offsets.extract32<1>();
+  const u32 o2 = offsets.extract32<2>();
+  const u32 o3 = offsets.extract32<3>();
+
+  // TODO: split in two, merge, maybe could be zx loaded instead..
+  u16 p0, p1, p2, p3;
+  std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
+  std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
+  std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
+  std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
+  GSVector4i pixels = GSVector4i::load(p0);
+  pixels = pixels.insert16<2>(p1);
+  pixels = pixels.insert16<4>(p2);
+  pixels = pixels.insert16<6>(p3);
+
+  return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
+{
+  if (x <= (VRAM_WIDTH - 4))
+  {
+    return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+  }
+  else
+  {
+    const u16* line = &g_vram[y * VRAM_WIDTH];
+    GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
+    return pixels;
+  }
+}
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
+{
+  if (x <= (VRAM_WIDTH - 4))
+  {
+    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+  }
+  else
+  {
+    u16* line = &g_vram[y * VRAM_WIDTH];
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
+    line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+  }
+}
+
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
+{
+  rg = rgb5a1 & GSVector4i::cxpr(0x1F);                     // R | R | R | R
+  rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
+  ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);         // B | B | B | B
+  ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
+{
+  GSVector4i res;
+
+  res = rg & GSVector4i::cxpr(0x1F);                       // R | R | R | R
+  res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0));  // RG | RG | RG | RG
+  res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
+  res = res | ba.srl32<16>().sll32<15>();                  // RGBA | RGBA | RGBA | RGBA
+
+  return res;
+}
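RGB5A1ToRG_BA() and RG_BAToRGB5A1() widen each 16-bit 1:5:5:5 pixel so that R/G and B/A each occupy a 16-bit half of a 32-bit lane, giving the per-component multiplies and adds headroom without bleeding into a neighbouring field. A scalar model of what happens in one lane; the helper names here are hypothetical, but the bit positions match the vector code above:

// Scalar model of the RG/BA split used by the vector rasterizer. One u32
// models one GSVector4i lane.
#include <cassert>
#include <cstdint>

static void SplitRGB5A1(uint32_t rgb5a1, uint32_t& rg, uint32_t& ba)
{
  rg = rgb5a1 & 0x1F;           // R stays in bits 0..4
  rg |= (rgb5a1 & 0x3E0) << 11; // G moves from bits 5..9 to bits 16..20
  ba = (rgb5a1 >> 10) & 0x1F;   // B moves from bits 10..14 to bits 0..4
  ba |= (rgb5a1 & 0x8000) << 1; // A (mask bit 15) moves to bit 16
}

static uint32_t MergeRGB5A1(uint32_t rg, uint32_t ba)
{
  uint32_t res = rg & 0x1F;   // R back to bits 0..4
  res |= (rg >> 11) & 0x3E0;  // G back to bits 5..9
  res |= (ba & 0x1F) << 10;   // B back to bits 10..14
  res |= (ba >> 16) << 15;    // A back to bit 15
  return res;
}

int main()
{
  // The split/merge pair is lossless for every 16-bit pixel value.
  for (uint32_t pixel = 0; pixel <= 0xFFFF; pixel++)
  {
    uint32_t rg, ba;
    SplitRGB5A1(pixel, rg, ba);
    assert(MergeRGB5A1(rg, ba) == pixel);
  }
}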
+
+// Color repeated twice for RG packing, then duplicated so we can load based on the X offset.
+static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
+#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
+
+  {R(0)}, {R(1)}, {R(2)}, {R(3)}
+
+#undef R
+#undef P
+};
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void
+ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg,
+           GSVector4i vertex_color_ba, GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask,
+           GSVector4i dither)
+{
+  static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
+  static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+
+  GSVector4i color;
+
+  if constexpr (texture_enable)
+  {
+    // Apply texture window
+    texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
+    texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+
+    const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
+    const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+    const GSVector4i palette_x = GSVector4i(cmd->palette.GetXBase());
+    const GSVector4i palette_y = GSVector4i(cmd->palette.GetYBase());
+
+    texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
+
+    GSVector4i texture_color;
+    switch (cmd->draw_mode.texture_mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+      {
+        GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
+        load_texcoord_x = base_x.add32(load_texcoord_x);
+        load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+        // todo: sse4 path
+        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
+        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
+#else
+        Assert(false && "Fixme");
+#endif
+
+        texture_color = GatherCLUTVector(palette_indices);
+      }
+      break;
+
+      case GPUTextureMode::Palette8Bit:
+      {
+        GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
+        load_texcoord_x = base_x.add32(load_texcoord_x);
+        load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
+        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
+#else
+        Assert(false && "Fixme");
+#endif
+
+        texture_color = GatherCLUTVector(palette_indices);
+      }
+      break;
+
+      default:
+      {
+        texcoord_x = base_x.add32(texcoord_x);
+        texcoord_x = texcoord_x & coord_mask_x;
+        texture_color = GatherVector(texcoord_x, texcoord_y);
+      }
+      break;
+    }
+
+    // check for zero texture colour across the 4 pixels, early out if so
+    const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
+    if (texture_transparent_mask.alltrue())
+      return;
+
+    preserve_mask = preserve_mask | texture_transparent_mask;
+
+    if constexpr (raw_texture_enable)
+    {
+      color = texture_color;
+    }
+    else
+    {
+      GSVector4i trg, tba;
+      RGB5A1ToRG_BA(texture_color, trg, tba);
+
+      // now we have both the texture and vertex color in RG/BA pairs, for 4 pixels, which we can multiply
+      GSVector4i rg = trg.mul16l(vertex_color_rg);
+      GSVector4i ba = tba.mul16l(vertex_color_ba);
+
+      // TODO: Dither
+      // Convert to 5bit.
+      if constexpr (dithering_enable)
+      {
+        rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+        ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+      }
+      else
+      {
+        rg = rg.sra16<7>();
+        ba = ba.sra16<7>();
+      }
+
+      // Bit15 gets passed through as-is.
+      ba = ba.blend16<0xaa>(tba);
+
+      // Clamp to 5bit.
+      static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
+      rg = rg.min_u16(colclamp);
+      ba = ba.min_u16(colclamp);
+
+      // And interleave back to 16bpp.
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+  }
+  else
+  {
+    // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
+    if constexpr (dithering_enable)
+    {
+      GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+      GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+
+      // Clamp to 5bit. We use a 32-bit splat for BA so that the A lanes get clamped to zero.
+      rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
+      ba = ba.min_u16(GSVector4i::cxpr(0x1F));
+
+      // And interleave back to 16bpp.
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+    else
+    {
+      // Note that bit15 is set to 0 here, which the shift will do.
+      const GSVector4i rg = vertex_color_rg.srl16<3>();
+      const GSVector4i ba = vertex_color_ba.srl16<3>();
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+  }
+
+  GSVector4i bg_color = LoadVector(start_x, y);
+
+  if constexpr (transparency_enable)
+  {
+    [[maybe_unused]] GSVector4i transparent_mask;
+    if constexpr (texture_enable)
+    {
+      // Compute transparent_mask, ffff per lane if transparent otherwise 0000
+      transparent_mask = color.sra16<15>();
+    }
+
+    // TODO: We don't need to OR color here with 0x8000 for textures.
+    // 0x8000 is added to match the scalar path.
+
+    GSVector4i blended_color;
+    switch (cmd->draw_mode.transparency_mode)
+    {
+      case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
+      {
+        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i res =
+          fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundPlusForeground:
+      {
+        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i sum = fg_bits.add32(bg_bits);
+        const GSVector4i carry =
+          (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
+        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundMinusForeground:
+      {
+        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
+        const GSVector4i borrow =
+          diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
+        const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundPlusQuarterForeground:
+      default:
+      {
+        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i fg_bits =
+          ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
+        const GSVector4i sum = fg_bits.add32(bg_bits);
+        const GSVector4i
+          carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
+        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+    }
+
+    // select blended pixels for transparent pixels, otherwise consider opaque
+    // TODO: SSE2
+    if constexpr (texture_enable)
+      color = color.blend8(blended_color, transparent_mask);
+    else
+      color = blended_color & GSVector4i::cxpr(0x7fff);
+  }
+
+  // TODO: lift out to parent?
+  const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
+  const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
+
+  GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
+  mask_bits_set = mask_bits_set.sra16<15>();      // ffff if masked else 0000
+  preserve_mask = preserve_mask | mask_bits_set;  // ffff if preserved else 0000
+
+  bg_color = bg_color & preserve_mask;
+  color = (color | mask_or).andnot(preserve_mask);
+  color = color | bg_color;
+
+  const GSVector4i packed_color = color.pu32();
+  StoreVector(start_x, y, packed_color);
+}
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+  const s32 origin_x = cmd->x;
+  const s32 origin_y = cmd->y;
+
+  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
+  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
+  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
+  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
+  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0
+
+  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
+  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);
+
+  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
+  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
+  const u32 width = cmd->width;
+
+  BACKUP_VRAM();
+
+  for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
+  {
+    const s32 y = origin_y + static_cast<s32>(offset_y);
+    if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
+        (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+    {
+      continue;
+    }
+
+    GSVector4i row_texcoord_x = texcoord_x;
+    GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+    GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+    for (u32 offset_x = 0; offset_x < width; offset_x += 4)
+    {
+      const s32 x = origin_x + static_cast<s32>(offset_x);
+
+      // width test
+      GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());
+
+      // clip test, if all pixels are outside, skip
+      preserve_mask = preserve_mask | xvec.lt32(clip_left);
+      preserve_mask = preserve_mask | xvec.gt32(clip_right);
+      if (!preserve_mask.alltrue())
+      {
+        ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
+          cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
+      }
+
+      xvec = xvec.add32(GSVector4i::cxpr(4));
+      wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+      if constexpr (texture_enable)
+        row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
+    }
+
+    if constexpr (texture_enable)
+      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
+  }
+
+  CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
+}
+
+#endif // USE_VECTOR
+
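The transparency arithmetic above blends all three 5-bit fields of four packed pixels without unpacking them, relying on the identity a + b = (a ^ b) + 2(a & b): clearing the per-field low bits of the XOR makes each field's contribution to the sum even, so the final halving shift cannot drag a bit across a field boundary, and the 32-bit lanes absorb the carry out of bit 15. A scalar model of the HalfBackgroundPlusHalfForeground case follows; AverageRGB555 is a hypothetical helper written for this note, not the patch's code:

// Scalar model of the carry-free 50%/50% blend on a packed 1:5:5:5 pixel.
#include <cassert>
#include <cstdint>

static uint32_t AverageRGB555(uint32_t fg, uint32_t bg)
{
  fg |= 0x8000u; // set both mask bits, as the vector path does
  bg |= 0x8000u;
  // 0x0421 selects the lowest bit of the R, G and B fields. Subtracting it
  // where fg and bg differ makes every field's sum even before halving.
  return ((fg + bg) - ((fg ^ bg) & 0x0421u)) >> 1;
}

int main()
{
  // Check against the per-component reference for a sample of inputs.
  for (uint32_t a = 0; a < 0x8000u; a += 97)
  {
    for (uint32_t b = 0; b < 0x8000u; b += 89)
    {
      const uint32_t avg = AverageRGB555(a, b) & 0x7FFFu;
      for (int shift = 0; shift <= 10; shift += 5)
      {
        const uint32_t ca = (a >> shift) & 0x1F, cb = (b >> shift) & 0x1F;
        assert(((avg >> shift) & 0x1F) == (ca + cb) / 2);
      }
    }
  }
}

The other cases play the same game in reverse: the 0x8421/0x8420 and 0x108420 masks detect per-field carries and borrows so additive and subtractive blends can saturate each field independently.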
 //////////////////////////////////////////////////////////////////////////
 // Polygon and line rasterization ported from Mednafen
 //////////////////////////////////////////////////////////////////////////
@@ -355,6 +805,8 @@ ALWAYS_INLINE_RELEASE static void AddIDeltas_DY(i_group& ig, const i_deltas& idl
   }
 }
 
+#ifndef USE_VECTOR
+
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
@@ -401,11 +853,150 @@ ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* c
   } while (--w > 0);
 }
 
+#else // USE_VECTOR
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound,
+                                           i_group ig, const i_deltas& idl)
+{
+  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
+    return;
+
+  s32 x_ig_adjust = x_start;
+  s32 w = x_bound - x_start;
+  s32 x = TruncateGPUVertexPosition(x_start);
+
+  if (x < static_cast<s32>(g_drawing_area.left))
+  {
+    s32 delta = static_cast<s32>(g_drawing_area.left) - x;
+    x_ig_adjust += delta;
+    x += delta;
+    w -= delta;
+  }
+
+  if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
+    w = static_cast<s32>(g_drawing_area.right) + 1 - x;
+
+  if (w <= 0)
+    return;
+
+  // TODO: Precompute.
+
+  const auto clip_left = GSVector4i(g_drawing_area.left);
+  const auto clip_right = GSVector4i(g_drawing_area.right);
+
+  const GSVector4i dr_dx = GSVector4i(idl.dr_dx * 4);
+  const GSVector4i dg_dx = GSVector4i(idl.dg_dx * 4);
+  const GSVector4i db_dx = GSVector4i(idl.db_dx * 4);
+  const GSVector4i du_dx = GSVector4i(idl.du_dx * 4);
+  const GSVector4i dv_dx = GSVector4i(idl.dv_dx * 4);
+
+  // TODO: vectorize
+  const GSVector4i dr_dx_offset = GSVector4i(0, idl.dr_dx, idl.dr_dx * 2, idl.dr_dx * 3);
+  const GSVector4i dg_dx_offset = GSVector4i(0, idl.dg_dx, idl.dg_dx * 2, idl.dg_dx * 3);
+  const GSVector4i db_dx_offset = GSVector4i(0, idl.db_dx, idl.db_dx * 2, idl.db_dx * 3);
+  const GSVector4i du_dx_offset = GSVector4i(0, idl.du_dx, idl.du_dx * 2, idl.du_dx * 3);
+  const GSVector4i dv_dx_offset = GSVector4i(0, idl.dv_dx, idl.dv_dx * 2, idl.dv_dx * 3);
+
+  GSVector4i dr, dg, db;
+  if constexpr (shading_enable)
+  {
+    dr = GSVector4i(ig.r + idl.dr_dx * x_ig_adjust).add32(dr_dx_offset);
+    dg = GSVector4i(ig.g + idl.dg_dx * x_ig_adjust).add32(dg_dx_offset);
+    db = GSVector4i(ig.b + idl.db_dx * x_ig_adjust).add32(db_dx_offset);
+  }
+  else
+  {
+    // precompute for flat shading
+    dr = GSVector4i(ig.r >> (COORD_FBS + COORD_POST_PADDING));
+    dg = GSVector4i((ig.g >> (COORD_FBS + COORD_POST_PADDING)) << 16);
+    db = GSVector4i(ig.b >> (COORD_FBS + COORD_POST_PADDING));
+  }
+
+  GSVector4i du = GSVector4i(ig.u + idl.du_dx * x_ig_adjust).add32(du_dx_offset);
+  GSVector4i dv = GSVector4i(ig.v + idl.dv_dx * x_ig_adjust).add32(dv_dx_offset);
+
+  // TODO: Move to caller.
+  if constexpr (shading_enable)
+  {
+    // TODO: vectorize multiply?
+    dr = dr.add32(GSVector4i(idl.dr_dy * y));
+    dg = dg.add32(GSVector4i(idl.dg_dy * y));
+    db = db.add32(GSVector4i(idl.db_dy * y));
+  }
+
+  if constexpr (texture_enable)
+  {
+    du = du.add32(GSVector4i(idl.du_dy * y));
+    dv = dv.add32(GSVector4i(idl.dv_dy * y));
+  }
+
+  const GSVector4i dither =
+    GSVector4i::load<false>(&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(x) & 3) * 2]);
+
+  GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+  GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+  for (s32 count = (w + 3) / 4; count > 0; --count)
+  {
+    // R000 | R000 | R000 | R000
+    // R0G0 | R0G0 | R0G0 | R0G0
+    const GSVector4i r = shading_enable ? dr.srl32<COORD_FBS + COORD_POST_PADDING>() : dr;
+    const GSVector4i g =
+      shading_enable ? dg.srl32<COORD_FBS + COORD_POST_PADDING>().sll32<16>() : dg; // get G into the correct position
+    const GSVector4i b = shading_enable ? db.srl32<COORD_FBS + COORD_POST_PADDING>() : db;
+    const GSVector4i u = du.srl32<COORD_FBS + COORD_POST_PADDING>();
+    const GSVector4i v = dv.srl32<COORD_FBS + COORD_POST_PADDING>();
+
+    // TODO: no-sse4
+    const GSVector4i rg = r.blend16<0xAA>(g);
+
+    // mask based on what's outside the span
+    auto preserve_mask = wvec.lt32(GSVector4i::zero());
+
+    // clip test, if all pixels are outside, skip
+    preserve_mask = preserve_mask | xvec.lt32(clip_left);
+    preserve_mask = preserve_mask | xvec.gt32(clip_right);
+    if (!preserve_mask.alltrue())
+    {
+      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+        cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
+    }
+
+    x += 4;
+
+    xvec = xvec.add32(GSVector4i::cxpr(4));
+    wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+    if constexpr (shading_enable)
+    {
+      dr = dr.add32(dr_dx);
+      dg = dg.add32(dg_dx);
+      db = db.add32(db_dx);
+    }
+
+    if constexpr (texture_enable)
+    {
+      du = du.add32(du_dx);
+      dv = dv.add32(dv_dx);
+    }
+  }
+}
+
+#endif // USE_VECTOR
+
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
                          const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
 {
+#if 0
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
+#endif
+
   u32 core_vertex;
   {
     u32 cvtemp = 0;
@@ -480,6 +1071,10 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
   if (!CalcIDeltas(idl, v0, v1, v2))
     return;
 
+#ifdef USE_VECTOR
+  BACKUP_VRAM();
+#endif
+
   const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
 
   i_group ig;
@@ -591,6 +1186,12 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
       }
     }
   }
+
+#ifdef USE_VECTOR
+  CHECK_VRAM(
+    GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable]
+                                            [dithering_enable](cmd, orig_v0, orig_v1, orig_v2));
+#endif
 }
 
 ALWAYS_INLINE_RELEASE static s64 LineDivide(s64 delta, s32 dk)
diff --git a/src/core/gpu_sw_rasterizer_avx2.cpp b/src/core/gpu_sw_rasterizer_avx2.cpp
new file mode 100644
index 000000000..c145ee98c
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer_avx2.cpp
@@ -0,0 +1,12 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gpu_sw_rasterizer.h"
+
+#include "common/assert.h"
+#include "common/gsvector.h"
+
+namespace GPU_SW_Rasterizer::AVX2 {
+#define USE_VECTOR 1
+#include "gpu_sw_rasterizer.inl"
+}
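One loose end in the patch: the Palette4Bit/Palette8Bit paths need a per-lane variable right shift (srlv32, AVX2's VPSRLVD), which is gated behind GSVECTOR_HAS_SRLV; on targets without it the code currently stops at Assert(false && "Fixme"). A possible SSE2-level fallback, sketched here with raw intrinsics rather than the GSVector4i wrapper (an assumption of this note, not something the patch provides, and it accepts a round trip through memory):

// Emulate a per-lane variable 32-bit right shift on plain SSE2.
#include <cstdint>
#include <emmintrin.h>

static __m128i srlv32_fallback(__m128i value, __m128i shift)
{
  alignas(16) uint32_t v[4], s[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(v), value);
  _mm_store_si128(reinterpret_cast<__m128i*>(s), shift);
  for (int i = 0; i < 4; i++)
    v[i] >>= s[i]; // shift amounts must be < 32; the palette paths use 0-12
  return _mm_load_si128(reinterpret_cast<const __m128i*>(v));
}

int main()
{
  const __m128i value = _mm_setr_epi32(0x10, 0x100, 0x1000, 0x10000);
  const __m128i shift = _mm_setr_epi32(4, 8, 12, 16);
  alignas(16) uint32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), srlv32_fallback(value, shift));
  return (out[0] == 1 && out[3] == 1) ? 0 : 1; // every lane shifts down to 1
}

Since the shift amounts in the rasterizer are always one of a few constants per draw (0/4/8/12 for 4-bit CLUTs, 0/8 for 8-bit), a shuffle- or multiply-based trick could avoid the memory round trip; the scalar spill just keeps the sketch simple.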