From 5725a0360bd4235f003db56c488ab2bf1540af78 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 6 Dec 2024 18:21:56 +1000
Subject: [PATCH] GPU: Use A1BGR5 format for SW/HashCache if available

---
 src/core/gpu_hw_texture_cache.cpp | 108 ++++---------
 src/core/gpu_sw.cpp               | 248 +++++-------------------------
 src/core/gpu_sw.h                 |   7 +-
 src/core/gpu_sw_rasterizer.inl    |   4 +-
 src/core/gpu_types.h              |  97 ++++++++++++
 5 files changed, 178 insertions(+), 286 deletions(-)
diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp
index 44de7df8e..a9b76131a 100644
--- a/src/core/gpu_hw_texture_cache.cpp
+++ b/src/core/gpu_hw_texture_cache.cpp
@@ -781,6 +781,8 @@ void GPUTextureCache::SetHashCacheTextureFormat()
   // Prefer 16-bit texture formats where possible.
   if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
     s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
+  else if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::A1BGR5))
+    s_state.hash_cache_texture_format = GPUTexture::Format::A1BGR5;
   else
     s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
 
@@ -1080,70 +1082,6 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
   return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
 }
 
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
-    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
-    dest += sizeof(c32);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
-    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
-    dest += sizeof(repacked);
-  }
-}
-
-#ifdef CPU_ARCH_SIMD
-
-ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
-{
-  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
-}
-
-ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
-{
-  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
-
-  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
-  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
-  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
-  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
-
-  return r | g.sll32<8>() | b.sll32<16>() | b.sll32<24>() | a;
-}
-
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
-    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
-
-    GSVector4i::store<false>(dest, low);
-    dest += sizeof(GSVector4i);
-
-    GSVector4i::store<false>(dest, high);
-    dest += sizeof(GSVector4i);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
-
-    const GSVector4i repacked =
-      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
-
-    GSVector4i::store<false>(dest, repacked);
-    dest += sizeof(GSVector4i);
-  }
-}
-
-#endif
-
 template<GPUTexture::Format format>
 void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
@@ -1175,17 +1113,17 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
         c16[5] = palette[(pp >> 4) & 0x0F];
         c16[6] = palette[(pp >> 8) & 0x0F];
         c16[7] = palette[pp >> 12];
-        WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+        ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
       }
 #endif
 
       for (; x < vram_width; x++)
       {
         const u32 pp = *(page_ptr++);
-        WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
-        WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
-        WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
-        WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0x0F]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 12]);
       }
 
       page += VRAM_WIDTH;
@@ -1206,7 +1144,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
         if (offs == 0)
           texel = *(page_ptr++);
 
-        WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0x0F]);
         texel >>= 4;
 
         offs = (offs + 1) % 4;
@@ -1251,15 +1189,15 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
         pp = *(page_ptr++);
         c16[6] = palette[pp & 0xFF];
         c16[7] = palette[(pp >> 8) & 0xFF];
-        WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+        ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
       }
 #endif
 
       for (; x < vram_width; x++)
       {
         const u32 pp = *(page_ptr++);
-        WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
-        WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0xFF]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 8]);
       }
 
       page += VRAM_WIDTH;
@@ -1280,7 +1218,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
         if (offs == 0)
           texel = *(page_ptr++);
 
-        WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
+        ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0xFF]);
         texel >>= 8;
 
         offs ^= 1;
@@ -1307,13 +1245,13 @@ void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8
 #ifdef CPU_ARCH_SIMD
     for (; x < aligned_width; x += pixels_per_vec)
     {
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
       page_ptr += pixels_per_vec;
     }
 #endif
 
     for (; x < width; x++)
-      WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
+      ConvertVRAMPixel<format>(dest_ptr, *(page_ptr++));
 
     page += VRAM_WIDTH;
     dest += dest_stride;
@@ -1359,6 +1297,24 @@ void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, co
         DefaultCaseIsUnreachable()
     }
   }
+  else if (dest_format == GPUTexture::Format::A1BGR5)
+  {
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::A1BGR5>(page_ptr, width, height, dest, dest_stride);
+        break;
+
+        DefaultCaseIsUnreachable()
+    }
+  }
   else
   {
     Panic("Unsupported texture format.");
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index 9159dbddc..3020246d6 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -41,10 +41,8 @@ bool GPU_SW::Initialize(Error* error)
   if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread))
     return false;
 
-  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1,
-                                                         GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
-  static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
-                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1};
+  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB5A1, GPUTexture::Format::A1BGR5,
+                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGBA8};
   for (const GPUTexture::Format format : formats_for_16bit)
   {
     if (g_gpu_device->SupportsTextureFormat(format))
@@ -53,15 +51,10 @@ bool GPU_SW::Initialize(Error* error)
       break;
     }
   }
-  for (const GPUTexture::Format format : formats_for_24bit)
-  {
-    if (g_gpu_device->SupportsTextureFormat(format))
-    {
-      m_24bit_display_format = format;
-      break;
-    }
-  }
 
+  // RGBA8 will always be supported, hence we'll find one.
+  INFO_LOG("Using {} format for 16-bit display", GPUTexture::GetFormatName(m_16bit_display_format));
+  Assert(m_16bit_display_format != GPUTexture::Format::Unknown);
   return true;
 }
 
@@ -108,129 +101,43 @@ GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format
   return m_upload_texture.get();
 }
 
-template<GPUTexture::Format out_format, typename out_type>
-static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
-
-template<GPUTexture::Format out_format, typename out_type>
-static out_type VRAM16ToOutput(u16 value);
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(u16 value)
-{
-  return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
-}
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(u16 value)
-{
-  return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  const u32 a = ((value >> 15) != 0) ? 255 : 0;
-  return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB5A1, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = value & GSVector4i::cxpr16(0x3E0);
-    GSVector4i b = value.srl16<10>() & single_mask;
-    GSVector4i c = (value & single_mask).sll16<10>();
-    value = (a | b) | c;
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
-    GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
-    GSVector4i c = (value.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
-    GSVector4i d = (value & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
-    value = (((a | b) | c) | d);
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(*(src_ptr++));
-}
-
 template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = width * sizeof(OutputPixelType);
+  u32 dst_stride = Common::AlignUpPow2(width * texture->GetPixelSize(), 4);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
   // Fast path when not wrapping around.
   if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
   {
+    [[maybe_unused]] constexpr u32 pixels_per_vec = 8;
+    [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
+
     const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x];
     const u32 src_step = VRAM_WIDTH << line_skip;
+
     for (u32 row = 0; row < height; row++)
     {
-      CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
+      const u16* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = dst_ptr;
+      u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_width; x += pixels_per_vec)
+      {
+        ConvertVRAMPixels<display_format>(dst_row_ptr, GSVector4i::load<false>(src_row_ptr));
+        src_row_ptr += pixels_per_vec;
+      }
+#endif
+
+      for (; x < width; x++)
+        ConvertVRAMPixel<display_format>(dst_row_ptr, *(src_row_ptr++));
+
       src_ptr += src_step;
       dst_ptr += dst_stride;
     }
@@ -242,10 +149,10 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u8* dst_row_ptr = dst_ptr;
 
       for (u32 col = src_x; col < end_x; col++)
-        *(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
+        ConvertVRAMPixel<display_format>(dst_row_ptr, src_row_ptr[col % VRAM_WIDTH]);
 
       src_y += y_step;
       dst_ptr += dst_stride;
@@ -260,18 +167,13 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
   return true;
 }
 
-template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
-  GPUTexture* texture = GetDisplayTexture(width, height, display_format);
+  GPUTexture* texture = GetDisplayTexture(width, height, FORMAT_FOR_24BIT);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
+  u32 dst_stride = width * sizeof(u32);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
@@ -281,52 +183,14 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16);
     for (u32 row = 0; row < height; row++)
     {
-      if constexpr (display_format == GPUTexture::Format::RGBA8)
+      const u8* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
+      for (u32 col = 0; col < width; col++)
       {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = 0xFF;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::BGRA8)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = src_row_ptr[2];
-          *(dst_row_ptr++) = src_row_ptr[1];
-          *(dst_row_ptr++) = src_row_ptr[0];
-          *(dst_row_ptr++) = 0xFF;
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB565)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = 0xFF;
       }
 
       src_ptr += src_stride;
@@ -340,7 +204,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u32* dst_row_ptr = reinterpret_cast<u32*>(dst_ptr);
 
       for (u32 col = 0; col < width; col++)
       {
@@ -350,22 +214,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         const u8 shift = static_cast<u8>(col & 1u) * 8;
         const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
 
-        if constexpr (display_format == GPUTexture::Format::RGBA8)
-        {
-          *(dst_row_ptr++) = rgb | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::BGRA8)
-        {
-          *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB565)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0x3E0000);
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x1F0000);
-        }
+        *(dst_row_ptr++) = rgb | 0xFF000000u;
       }
 
       src_y += y_step;
@@ -392,6 +241,9 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
       case GPUTexture::Format::RGB5A1:
         return CopyOut15Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, width, height, line_skip);
 
+      case GPUTexture::Format::A1BGR5:
+        return CopyOut15Bit<GPUTexture::Format::A1BGR5>(src_x, src_y, width, height, line_skip);
+
       case GPUTexture::Format::RGB565:
         return CopyOut15Bit<GPUTexture::Format::RGB565>(src_x, src_y, width, height, line_skip);
 
@@ -407,23 +259,7 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
   }
   else
   {
-    switch (m_24bit_display_format)
-    {
-      case GPUTexture::Format::RGB5A1:
-        return CopyOut24Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGB565:
-        return CopyOut24Bit<GPUTexture::Format::RGB565>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGBA8:
-        return CopyOut24Bit<GPUTexture::Format::RGBA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::BGRA8:
-        return CopyOut24Bit<GPUTexture::Format::BGRA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      default:
-        UnreachableCode();
-    }
+    return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip);
   }
 }
 
diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h
index 2251843aa..b99bdd168 100644
--- a/src/core/gpu_sw.h
+++ b/src/core/gpu_sw.h
@@ -45,7 +45,6 @@ protected:
   template<GPUTexture::Format display_format>
   bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip);
 
-  template<GPUTexture::Format display_format>
   bool CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip);
 
   bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit);
@@ -57,11 +56,13 @@ protected:
   void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
   void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
 
+private:
+  static constexpr GPUTexture::Format FORMAT_FOR_24BIT = GPUTexture::Format::RGBA8; // RGBA8 always supported.
+
   GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format);
 
   FixedHeapArray<u8, GPU_MAX_DISPLAY_WIDTH * GPU_MAX_DISPLAY_HEIGHT * sizeof(u32)> m_upload_buffer;
-  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565;
-  GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8;
+  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::Unknown;
   std::unique_ptr<GPUTexture> m_upload_texture;
 
   GPU_SW_Backend m_backend;
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 803dbfdef..d470a93a6 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -3,8 +3,10 @@
 
 #ifdef __INTELLISENSE__
 
-#include "common/gsvector.h"
 #include "gpu.h"
+
+#include "common/gsvector.h"
+
 #include <algorithm>
 
 #define USE_VECTOR 1
diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h
index 137264ec5..4782272b0 100644
--- a/src/core/gpu_types.h
+++ b/src/core/gpu_types.h
@@ -5,6 +5,8 @@
 
 #include "types.h"
 
+#include "util/gpu_texture.h"
+
 #include "common/bitfield.h"
 #include "common/bitutils.h"
 #include "common/gsvector.h"
@@ -249,6 +251,101 @@ ALWAYS_INLINE static constexpr u16 VRAMRGBA8888ToRGBA5551(u32 color)
   return Truncate16(r | (g << 5) | (b << 10) | (a << 15));
 }
 
+#ifdef CPU_ARCH_SIMD
+
+ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val) // (val * 527 + 23) >> 6: maps 0..31 onto 0..255
+{
+  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
+}
+
+ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
+{ // Widens each 5-bit colour field to 8 bits and turns bit 15 into an all-or-nothing alpha byte.
+  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
+
+  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);               // source bits 0-4
+  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));  // source bits 5-9
+  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask)); // source bits 10-14
+  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();    // bit 15 -> bits 24-31 (relies on sra32 sign-extending)
+
+  return r | g.sll32<8>() | b.sll32<16>() | a; // R in bits 0-7, G in 8-15, B in 16-23, A in 24-31
+}
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixels(u8*& dest, GSVector4i c16)
+{ // Repacks the eight 16-bit VRAM pixels held in c16 as 'format', stores them at dest and advances dest.
+  if constexpr (format == GPUTexture::Format::RGBA8) // 16 -> 32 bits per pixel, hence two vector stores
+  {
+    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
+    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
+
+    GSVector4i::store<false>(dest, low);
+    dest += sizeof(GSVector4i);
+
+    GSVector4i::store<false>(dest, high);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1) // keep A (bit 15) and G, swap the R and B fields
+  {
+    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
+
+    const GSVector4i repacked =
+      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5) // R -> bits 11-15, G -> 6-10, B -> 1-5, A -> bit 0
+  {
+    const GSVector4i repacked = (c16 & GSVector4i::cxpr16(static_cast<s16>(0x3E0))).sll16<1>() |
+                                (c16.srl16<9>() & GSVector4i::cxpr16(0x3E)) |
+                                (c16 & GSVector4i::cxpr16(0x1F)).sll16<11>() | c16.srl16<15>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565) // R -> bits 11-15, 6-bit G -> 5-10, B -> 0-4; bit 15 dropped
+  {
+    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
+    const GSVector4i a = (c16 & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // ((c16 & 0x3E0) << 1): green -> bits 6-10
+    const GSVector4i b = (c16 & GSVector4i::cxpr16(0x200)).srl16<4>(); // ((c16 & 0x200) >> 4): green MSB -> bit 5
+    const GSVector4i c = (c16.srl16<10>() & single_mask);              // ((c16 >> 10) & 0x1F): blue -> bits 0-4
+    const GSVector4i d = (c16 & single_mask).sll16<11>();              // ((c16 & 0x1F) << 11): red -> bits 11-15
+    GSVector4i::store<false>(dest, (((a | b) | c) | d));
+    dest += sizeof(GSVector4i);
+  }
+} // NOTE: a format with no branch above compiles to a no-op (nothing stored, dest not advanced).
+
+#endif
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixel(u8*& dest, u16 c16)
+{ // Scalar counterpart of ConvertVRAMPixels: repacks one VRAM pixel as 'format', stores it and advances dest.
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
+    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32)); // dest must be sizeof(c32)-aligned
+    dest += sizeof(c32);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1) // keep A (bit 15) and G, swap the R and B fields
+  {
+    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5) // R -> bits 11-15, G -> 6-10, B -> 1-5, A -> bit 0
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) | ((c16 & 0x1F) << 11) | (c16 >> 15);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565) // green MSB (0x200) is replicated into the 6-bit LSB
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 & 0x200) >> 4) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 11);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+} // NOTE: a format with no branch above compiles to a no-op (nothing stored, dest not advanced).
+
 union GPUVertexPosition
 {
   u32 bits;