From c46ec398dccd901e394f4d3bae6ec305ba05c018 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Thu, 26 Sep 2024 18:29:22 +1000
Subject: [PATCH] GPU: Move software fill/write/copy into rasterizer namespace

---
 src/core/gpu.cpp               | 191 +--------------------------
 src/core/gpu.h                 |  22 +---
 src/core/gpu_hw.cpp            |   1 +
 src/core/gpu_sw_backend.cpp    | 176 +------------------------
 src/core/gpu_sw_rasterizer.cpp |  44 ++++---
 src/core/gpu_sw_rasterizer.h   |  26 ++--
 src/core/gpu_sw_rasterizer.inl | 231 +++++++++++++++++++++++++++++++++
 src/core/gpu_types.h           |   2 -
 8 files changed, 282 insertions(+), 411 deletions(-)
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 6de97d785..7c3723105 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -4,6 +4,7 @@
 #include "gpu.h"
 #include "dma.h"
 #include "gpu_shadergen.h"
+#include "gpu_sw_rasterizer.h"
 #include "host.h"
 #include "interrupt_controller.h"
 #include "settings.h"
@@ -72,6 +73,7 @@ static void JoinScreenshotThreads();
 
 GPU::GPU()
 {
+  GPU_SW_Rasterizer::SelectImplementation();
   ResetStatistics();
 }
 
@@ -1529,195 +1531,6 @@ void GPU::ClearDisplay()
   DestroyDeinterlaceTextures();
 }
 
-void GPU::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
-{
-}
-
-void GPU::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
-{
-  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
-  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
-  constexpr u32 vector_width = 8;
-  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
-
-  if ((x + width) <= VRAM_WIDTH && !IsInterlacedRenderingEnabled())
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-      u32 xoffs = 0;
-      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-        GSVector4i::store<false>(row_ptr, fill);
-      for (; xoffs < width; xoffs++)
-        *(row_ptr++) = color16;
-    }
-  }
-  else if (IsInterlacedRenderingEnabled())
-  {
-    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
-    if (IsCRTCScanlinePending())
-      SynchronizeCRTC();
-
-    const u32 active_field = GetActiveLineLSB();
-    if ((x + width) <= VRAM_WIDTH)
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-        u32 xoffs = 0;
-        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-          GSVector4i::store<false>(row_ptr, fill);
-        for (; xoffs < width; xoffs++)
-          *(row_ptr++) = color16;
-      }
-    }
-    else
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-        for (u32 xoffs = 0; xoffs < width; xoffs++)
-        {
-          const u32 col = (x + xoffs) % VRAM_WIDTH;
-          row_ptr[col] = color16;
-        }
-      }
-    }
-  }
-  else
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-      for (u32 xoffs = 0; xoffs < width; xoffs++)
-      {
-        const u32 col = (x + xoffs) % VRAM_WIDTH;
-        row_ptr[col] = color16;
-      }
-    }
-  }
-}
-
-void GPU::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
-{
-  // Fast path when the copy is not oversized.
-  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
-  {
-    const u16* src_ptr = static_cast<const u16*>(data);
-    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      std::copy_n(src_ptr, width, dst_ptr);
-      src_ptr += width;
-      dst_ptr += VRAM_WIDTH;
-    }
-  }
-  else
-  {
-    // Slow path when we need to handle wrap-around.
-    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
-    const u16* src_ptr = static_cast<const u16*>(data);
-    const u16 mask_and = check_mask ? 0x8000 : 0;
-    const u16 mask_or = set_mask ? 0x8000 : 0;
-
-    for (u32 row = 0; row < height;)
-    {
-      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
-      for (u32 col = 0; col < width;)
-      {
-        // TODO: Handle unaligned reads...
-        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
-        if (((*pixel_ptr) & mask_and) == 0)
-          *pixel_ptr = *(src_ptr++) | mask_or;
-      }
-    }
-  }
-}
-
-void GPU::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
-{
-  // Break up oversized copies. This behavior has not been verified on console.
-  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
-  {
-    u32 remaining_rows = height;
-    u32 current_src_y = src_y;
-    u32 current_dst_y = dst_y;
-    while (remaining_rows > 0)
-    {
-      const u32 rows_to_copy =
-        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
-
-      u32 remaining_columns = width;
-      u32 current_src_x = src_x;
-      u32 current_dst_x = dst_x;
-      while (remaining_columns > 0)
-      {
-        const u32 columns_to_copy =
-          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
-        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy);
-        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
-        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
-        remaining_columns -= columns_to_copy;
-      }
-
-      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
-      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
-      remaining_rows -= rows_to_copy;
-    }
-
-    return;
-  }
-
-  // This doesn't have a fast path, but do we really need one? It's not common.
-  const u16 mask_and = m_GPUSTAT.GetMaskAND();
-  const u16 mask_or = m_GPUSTAT.GetMaskOR();
-
-  // Copy in reverse when src_x < dst_x, this is verified on console.
-  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-  else
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (u32 col = 0; col < width; col++)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-}
-
 void GPU::SetClampedDrawingArea()
 {
   if (m_drawing_area.left > m_drawing_area.right || m_drawing_area.top > m_drawing_area.bottom) [[unlikely]]
diff --git a/src/core/gpu.h b/src/core/gpu.h
index fa76559f8..03283ca13 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -322,10 +322,10 @@ protected:
   bool IsCLUTValid() const;
 
   // Rendering in the backend
-  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height);
-  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color);
-  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask);
-  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height);
+  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0;
+  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) = 0;
+  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) = 0;
+  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) = 0;
   virtual void DispatchRenderCommand() = 0;
   virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0;
   virtual void UpdateDisplay() = 0;
@@ -416,6 +416,8 @@ protected:
 
   union GPUSTAT
   {
+    // Mask semantics for transfer/render operations (bit 11 = set mask while drawing, bit 12 = check mask):
+    //   if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
     u32 bits;
     BitField<u32, u8, 0, 4> texture_page_x_base;
     BitField<u32, u8, 4, 1> texture_page_y_base;
@@ -459,18 +461,6 @@ protected:
       static constexpr u32 ACTIVE = (1 << 19) | (1 << 22);
       return ((bits & ACTIVE) == ACTIVE);
     }
-
-    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
-    ALWAYS_INLINE u16 GetMaskAND() const
-    {
-      // return check_mask_before_draw ? 0x8000 : 0x0000;
-      return Truncate16((bits << 3) & 0x8000);
-    }
-    ALWAYS_INLINE u16 GetMaskOR() const
-    {
-      // return set_mask_while_drawing ? 0x8000 : 0x0000;
-      return Truncate16((bits << 4) & 0x8000);
-    }
   } m_GPUSTAT = {};
 
   struct DrawMode
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index f01fa0d2d..ea0f24d48 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -6,6 +6,7 @@
 #include "cpu_pgxp.h"
 #include "gpu_hw_shadergen.h"
 #include "gpu_sw_backend.h"
+#include "gpu_sw_rasterizer.h"
 #include "host.h"
 #include "settings.h"
 #include "system.h"
diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp
index 6168f0d39..57e5e5196 100644
--- a/src/core/gpu_sw_backend.cpp
+++ b/src/core/gpu_sw_backend.cpp
@@ -16,8 +16,6 @@ GPU_SW_Backend::~GPU_SW_Backend() = default;
 
 bool GPU_SW_Backend::Initialize(bool force_thread)
 {
-  GPU_SW_Rasterizer::SelectImplementation();
-
   return GPUBackend::Initialize(force_thread);
 }
 
@@ -59,186 +57,20 @@ void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)
 
 void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
 {
-  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
-  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
-  constexpr u32 vector_width = 8;
-  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
-
-  if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering)
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-      u32 xoffs = 0;
-      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-        GSVector4i::store<false>(row_ptr, fill);
-      for (; xoffs < width; xoffs++)
-        *(row_ptr++) = color16;
-    }
-  }
-  else if (params.interlaced_rendering)
-  {
-    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
-    const u32 active_field = params.active_line_lsb;
-
-    if ((x + width) <= VRAM_WIDTH)
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-        u32 xoffs = 0;
-        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-          GSVector4i::store<false>(row_ptr, fill);
-        for (; xoffs < width; xoffs++)
-          *(row_ptr++) = color16;
-      }
-    }
-    else
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-        for (u32 xoffs = 0; xoffs < width; xoffs++)
-        {
-          const u32 col = (x + xoffs) % VRAM_WIDTH;
-          row_ptr[col] = color16;
-        }
-      }
-    }
-  }
-  else
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-      for (u32 xoffs = 0; xoffs < width; xoffs++)
-      {
-        const u32 col = (x + xoffs) % VRAM_WIDTH;
-        row_ptr[col] = color16;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, params.interlaced_rendering, params.active_line_lsb);
 }
 
 void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
                                 GPUBackendCommandParameters params)
 {
-  // Fast path when the copy is not oversized.
-  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !params.IsMaskingEnabled())
-  {
-    const u16* src_ptr = static_cast<const u16*>(data);
-    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      std::copy_n(src_ptr, width, dst_ptr);
-      src_ptr += width;
-      dst_ptr += VRAM_WIDTH;
-    }
-  }
-  else
-  {
-    // Slow path when we need to handle wrap-around.
-    const u16* src_ptr = static_cast<const u16*>(data);
-    const u16 mask_and = params.GetMaskAND();
-    const u16 mask_or = params.GetMaskOR();
-
-    for (u32 row = 0; row < height;)
-    {
-      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
-      for (u32 col = 0; col < width;)
-      {
-        // TODO: Handle unaligned reads...
-        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
-        if (((*pixel_ptr) & mask_and) == 0)
-          *pixel_ptr = *(src_ptr++) | mask_or;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw);
 }
 
 void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
                               GPUBackendCommandParameters params)
 {
-  // Break up oversized copies. This behavior has not been verified on console.
-  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
-  {
-    u32 remaining_rows = height;
-    u32 current_src_y = src_y;
-    u32 current_dst_y = dst_y;
-    while (remaining_rows > 0)
-    {
-      const u32 rows_to_copy =
-        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
-
-      u32 remaining_columns = width;
-      u32 current_src_x = src_x;
-      u32 current_dst_x = dst_x;
-      while (remaining_columns > 0)
-      {
-        const u32 columns_to_copy =
-          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
-        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, params);
-        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
-        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
-        remaining_columns -= columns_to_copy;
-      }
-
-      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
-      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
-      remaining_rows -= rows_to_copy;
-    }
-
-    return;
-  }
-
-  // This doesn't have a fast path, but do we really need one? It's not common.
-  const u16 mask_and = params.GetMaskAND();
-  const u16 mask_or = params.GetMaskOR();
-
-  // Copy in reverse when src_x < dst_x, this is verified on console.
-  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-  else
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (u32 col = 0; col < width; col++)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
+                              params.check_mask_before_draw);
 }
 
 void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 816cf3076..87fd65134 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -12,11 +12,6 @@
 LOG_CHANNEL(GPU_SW_Rasterizer);
 
 namespace GPU_SW_Rasterizer {
-// Default implementation, compatible with all ISAs.
-extern const DrawRectangleFunctionTable DrawRectangleFunctions;
-extern const DrawTriangleFunctionTable DrawTriangleFunctions;
-extern const DrawLineFunctionTable DrawLineFunctions;
-
 constinit const DitherLUT g_dither_lut = []() constexpr {
   DitherLUT lut = {};
   for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
@@ -33,30 +28,33 @@ constinit const DitherLUT g_dither_lut = []() constexpr {
   return lut;
 }();
 
+const DrawRectangleFunctionTable* DrawRectangleFunctions = nullptr;
+const DrawTriangleFunctionTable* DrawTriangleFunctions = nullptr;
+const DrawLineFunctionTable* DrawLineFunctions = nullptr;
+FillVRAMFunction FillVRAM = nullptr;
+WriteVRAMFunction WriteVRAM = nullptr;
+CopyVRAMFunction CopyVRAM = nullptr;
 GPUDrawingArea g_drawing_area = {};
 } // namespace GPU_SW_Rasterizer
 
-// Default implementation definitions.
-namespace GPU_SW_Rasterizer {
+// Default scalar implementation definitions.
+namespace GPU_SW_Rasterizer::Scalar {
+namespace {
 #include "gpu_sw_rasterizer.inl"
 }
+} // namespace GPU_SW_Rasterizer::Scalar
 
 // Default vector implementation definitions.
 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
 namespace GPU_SW_Rasterizer::SIMD {
+namespace {
 #define USE_VECTOR 1
 #include "gpu_sw_rasterizer.inl"
 #undef USE_VECTOR
+} // namespace
 } // namespace GPU_SW_Rasterizer::SIMD
 #endif
 
-// Initialize with default implementation.
-namespace GPU_SW_Rasterizer {
-const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
-const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions = &DrawTriangleFunctions;
-const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
-} // namespace GPU_SW_Rasterizer
-
 // Declare alternative implementations.
 void GPU_SW_Rasterizer::SelectImplementation()
 {
@@ -66,13 +64,16 @@ void GPU_SW_Rasterizer::SelectImplementation()
 
   selected = true;
 
-#define SELECT_ALTERNATIVE_RASTERIZER(isa)                                                                             \
+#define SELECT_IMPLEMENTATION(isa)                                                                                     \
   do                                                                                                                   \
   {                                                                                                                    \
     INFO_LOG("Using " #isa " software rasterizer implementation.");                                                    \
-    SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions;                                                     \
-    SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions;                                                       \
-    SelectedDrawLineFunctions = &isa::DrawLineFunctions;                                                               \
+    DrawRectangleFunctions = &isa::DrawRectangleFunctions;                                                             \
+    DrawTriangleFunctions = &isa::DrawTriangleFunctions;                                                               \
+    DrawLineFunctions = &isa::DrawLineFunctions;                                                                       \
+    FillVRAM = &isa::FillVRAMImpl;                                                                                     \
+    WriteVRAM = &isa::WriteVRAMImpl;                                                                                   \
+    CopyVRAM = &isa::CopyVRAMImpl;                                                                                     \
   } while (0)
 
 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
@@ -83,19 +84,20 @@ void GPU_SW_Rasterizer::SelectImplementation()
 #if defined(CPU_ARCH_SSE) && defined(_MSC_VER) && 0
   if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
   {
-    SELECT_ALTERNATIVE_RASTERIZER(AVX2);
+    SELECT_IMPLEMENTATION(AVX2);
     return;
   }
 #endif
 
   if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0)
   {
-    SELECT_ALTERNATIVE_RASTERIZER(SIMD);
+    SELECT_IMPLEMENTATION(SIMD);
     return;
   }
 #endif
 
-  INFO_LOG("Using scalar software rasterizer implementation.");
+  // SELECT_IMPLEMENTATION logs the chosen implementation itself, so no separate message is needed here.
+  SELECT_IMPLEMENTATION(Scalar);
 
-#undef SELECT_ALTERNATIVE_RASTERIZER
+#undef SELECT_IMPLEMENTATION
 }
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
index a3eebe062..3861d9945 100644
--- a/src/core/gpu_sw_rasterizer.h
+++ b/src/core/gpu_sw_rasterizer.h
@@ -34,34 +34,38 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
                                   const GPUBackendDrawLineCommand::Vertex* p1);
 typedef const DrawLineFunction DrawLineFunctionTable[2][2];
 
-// Default implementation, compatible with all ISAs.
-extern const DrawRectangleFunctionTable DrawRectangleFunctions;
-extern const DrawTriangleFunctionTable DrawTriangleFunctions;
-extern const DrawLineFunctionTable DrawLineFunctions;
+using FillVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb);
+using WriteVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask,
+                                   bool check_mask);
+using CopyVRAMFunction = void (*)(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
+                                  bool check_mask);
 
 // Current implementation, selected at runtime.
-extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
-extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
-extern const DrawLineFunctionTable* SelectedDrawLineFunctions;
+extern const DrawRectangleFunctionTable* DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable* DrawTriangleFunctions;
+extern const DrawLineFunctionTable* DrawLineFunctions;
+extern FillVRAMFunction FillVRAM;
+extern WriteVRAMFunction WriteVRAM;
+extern CopyVRAMFunction CopyVRAM;
 
 extern void SelectImplementation();
 
 ALWAYS_INLINE static DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable)
 {
-  return (*SelectedDrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)];
+  return (*DrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)];
 }
 
 ALWAYS_INLINE static DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
                                                                     bool transparency_enable)
 {
-  return (*SelectedDrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
+  return (*DrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
 }
 
 ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
                                                                   bool raw_texture_enable, bool transparency_enable)
 {
-  return (*SelectedDrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)]
-                                         [u8(transparency_enable)];
+  return (
+    *DrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
 }
 
 #define DECLARE_ALTERNATIVE_RASTERIZER(isa)                                                                            \
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index b49d344f7..0a1ed95e3 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -1519,6 +1519,237 @@ constinit const DrawTriangleFunctionTable DrawTriangleFunctions = {
    {{&DrawTriangle<true, true, false, false>, &DrawTriangle<true, true, false, true>},
     {&DrawTriangle<true, true, true, false>, &DrawTriangle<true, true, true, true>}}}};
 
+static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb)
+{
+#ifdef USE_VECTOR // vector build: rows that do not wrap horizontally are filled with GSVector4i stores
+  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
+  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
+  constexpr u32 vector_width = 8; // number of u16 pixels packed into the fill vector above
+  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
+
+  if ((x + width) <= VRAM_WIDTH && !interlaced)
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
+      u32 xoffs = 0;
+      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
+        GSVector4i::store<false>(row_ptr, fill);
+      for (; xoffs < width; xoffs++) // scalar tail for the pixels past aligned_width
+        *(row_ptr++) = color16;
+    }
+  }
+  else if (interlaced)
+  {
+    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
+    const u32 active_field = active_line_lsb;
+
+    if ((x + width) <= VRAM_WIDTH)
+    {
+      for (u32 yoffs = 0; yoffs < height; yoffs++)
+      {
+        const u32 row = (y + yoffs) % VRAM_HEIGHT;
+        if ((row & u32(1)) == active_field)
+          continue; // rows whose LSB equals active_line_lsb are left untouched
+
+        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
+        u32 xoffs = 0;
+        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
+          GSVector4i::store<false>(row_ptr, fill);
+        for (; xoffs < width; xoffs++)
+          *(row_ptr++) = color16;
+      }
+    }
+    else
+    {
+      for (u32 yoffs = 0; yoffs < height; yoffs++)
+      {
+        const u32 row = (y + yoffs) % VRAM_HEIGHT;
+        if ((row & u32(1)) == active_field)
+          continue; // rows whose LSB equals active_line_lsb are left untouched
+
+        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+        for (u32 xoffs = 0; xoffs < width; xoffs++)
+        {
+          const u32 col = (x + xoffs) % VRAM_WIDTH;
+          row_ptr[col] = color16;
+        }
+      }
+    }
+  }
+  else // non-interlaced fill that wraps horizontally: per-pixel modulo addressing
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+#else // !USE_VECTOR: scalar build
+  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
+  if ((x + width) <= VRAM_WIDTH && !interlaced)
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
+    }
+  }
+  else if (interlaced)
+  {
+    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
+    const u32 active_field = active_line_lsb;
+
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      if ((row & u32(1)) == active_field)
+        continue; // rows whose LSB equals active_line_lsb are left untouched
+
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+  else
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+#endif // USE_VECTOR
+}
+
+static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
+{
+  // Writes width x height 16-bit pixels from data into g_vram, wrapping at the VRAM edges when oversized.
+  // TODO: Vector implementation
+  // Fast path when the copy is not oversized.
+  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
+  {
+    const u16* src_ptr = static_cast<const u16*>(data);
+    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      std::copy_n(src_ptr, width, dst_ptr);
+      src_ptr += width;
+      dst_ptr += VRAM_WIDTH;
+    }
+  }
+  else
+  {
+    // Slow path when we need to handle wrap-around and/or masking.
+    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
+    const u16* src_ptr = static_cast<const u16*>(data);
+    const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
+    const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
+
+    for (u32 row = 0; row < height;)
+    {
+      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
+      for (u32 col = 0; col < width;)
+      {
+        // TODO: Handle unaligned reads...
+        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
+        const u16 src_pixel = *(src_ptr++); // always consumed, even when the destination pixel is masked
+        *pixel_ptr = (((*pixel_ptr) & mask_and) == 0) ? (src_pixel | mask_or) : *pixel_ptr;
+      }
+    }
+  }
+}
+
+static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
+                         bool check_mask)
+{
+  // Copies a width x height rectangle inside g_vram with per-pixel mask handling. TODO: Vector implementation.
+
+  // Break up oversized copies into non-wrapping pieces. This behavior has not been verified on console.
+  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
+  {
+    u32 remaining_rows = height;
+    u32 current_src_y = src_y;
+    u32 current_dst_y = dst_y;
+    while (remaining_rows > 0)
+    {
+      const u32 rows_to_copy =
+        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
+
+      u32 remaining_columns = width;
+      u32 current_src_x = src_x;
+      u32 current_dst_x = dst_x;
+      while (remaining_columns > 0) // each piece recurses into this Impl directly, not via the CopyVRAM pointer
+      {
+        const u32 columns_to_copy =
+          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
+        CopyVRAMImpl(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy,
+                     set_mask, check_mask);
+        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
+        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
+        remaining_columns -= columns_to_copy;
+      }
+
+      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
+      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
+      remaining_rows -= rows_to_copy;
+    }
+
+    return;
+  }
+
+  // This doesn't have a fast path, but do we really need one? It's not common.
+  const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
+  const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
+
+  // Copy in reverse when src_x < dst_x, this is verified on console.
+  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
+      }
+    }
+  }
+  else
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (u32 col = 0; col < width; col++)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
+        *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
+      }
+    }
+  }
+}
+
 #ifdef __INTELLISENSE__
 }
 #endif
diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h
index 99fa8932f..95e153e5e 100644
--- a/src/core/gpu_types.h
+++ b/src/core/gpu_types.h
@@ -275,8 +275,6 @@ union GPUBackendCommandParameters
   BitField<u8, bool, 2, 1> set_mask_while_drawing;
   BitField<u8, bool, 3, 1> check_mask_before_draw;
 
-  ALWAYS_INLINE bool IsMaskingEnabled() const { return (bits & 12u) != 0u; }
-
   // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
   u16 GetMaskAND() const
   {