GPU: Use A1BGR5 format for SW/HashCache if available

parent 8c5fadafba
commit 5725a0360b
@@ -781,6 +781,8 @@ void GPUTextureCache::SetHashCacheTextureFormat()
   // Prefer 16-bit texture formats where possible.
   if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
     s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
+  else if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::A1BGR5))
+    s_state.hash_cache_texture_format = GPUTexture::Format::A1BGR5;
   else
     s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
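
Editor's note on the format choice above: PS1 VRAM texels are 16-bit with red in the low bits (R = bits 0-4, G = 5-9, B = 10-14, mask/alpha = bit 15), so both RGB5A1 and A1BGR5 host formats are reachable with a pure 16-bit bit shuffle instead of widening every texel to RGBA8. A minimal standalone sketch of the A1BGR5 shuffle (hypothetical test program, not part of the commit):

#include <cassert>
#include <cstdint>

// VRAM 1555 (R low) -> host A1BGR5 (A = bit 0, B = bits 1-5, G = 6-10, R = 11-15).
static uint16_t ToA1BGR5(uint16_t v)
{
  return static_cast<uint16_t>(((v & 0x3E0) << 1) | ((v >> 9) & 0x3E) | ((v & 0x1F) << 11) | (v >> 15));
}

int main()
{
  assert(ToA1BGR5(0x001F) == 0xF800); // pure red lands in bits 11-15
  assert(ToA1BGR5(0x8000) == 0x0001); // mask bit lands in bit 0
  return 0;
}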
@@ -1080,70 +1082,6 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
   return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
 }
 
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
-    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
-    dest += sizeof(c32);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
-    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
-    dest += sizeof(repacked);
-  }
-}
-
-#ifdef CPU_ARCH_SIMD
-
-ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
-{
-  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
-}
-
-ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
-{
-  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
-
-  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
-  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
-  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
-  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
-
-  return r | g.sll32<8>() | b.sll32<16>() | a;
-}
-
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
-    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
-
-    GSVector4i::store<false>(dest, low);
-    dest += sizeof(GSVector4i);
-
-    GSVector4i::store<false>(dest, high);
-    dest += sizeof(GSVector4i);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
-
-    const GSVector4i repacked =
-      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
-
-    GSVector4i::store<false>(dest, repacked);
-    dest += sizeof(GSVector4i);
-  }
-}
-
-#endif
-
 template<GPUTexture::Format format>
 void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
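
The VRAM5BitTo8Bit helper removed here (and re-added in gpu.h below) expands a 5-bit channel without a divide: (x * 527 + 23) >> 6 equals round(x * 255 / 31) for all 32 inputs. Spot checks:

// x = 0  -> (0     + 23) >> 6 = 0
// x = 15 -> (7905  + 23) >> 6 = 123   (255 * 15 / 31 ≈ 123.4)
// x = 31 -> (16337 + 23) >> 6 = 255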
@@ -1175,17 +1113,17 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       c16[5] = palette[(pp >> 4) & 0x0F];
       c16[6] = palette[(pp >> 8) & 0x0F];
       c16[7] = palette[pp >> 12];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 12]);
     }
 
     page += VRAM_WIDTH;
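
For reference, each halfword fetched from the page in DecodeTexture4 packs four 4-bit palette indices, lowest nibble first; a hypothetical helper (not in the tree) spelling out the order the loop above uses:

#include <array>
#include <cstdint>

static std::array<uint8_t, 4> UnpackNibbles(uint16_t pp)
{
  return {static_cast<uint8_t>(pp & 0x0F), static_cast<uint8_t>((pp >> 4) & 0x0F),
          static_cast<uint8_t>((pp >> 8) & 0x0F), static_cast<uint8_t>(pp >> 12)};
}

// UnpackNibbles(0xCBA9) yields {0x9, 0xA, 0xB, 0xC}: the leftmost texel comes from the lowest nibble.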
@@ -1206,7 +1144,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0x0F]);
       texel >>= 4;
 
       offs = (offs + 1) % 4;
@@ -1251,15 +1189,15 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       pp = *(page_ptr++);
       c16[6] = palette[pp & 0xFF];
       c16[7] = palette[(pp >> 8) & 0xFF];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 8]);
     }
 
     page += VRAM_WIDTH;
@@ -1280,7 +1218,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0xFF]);
       texel >>= 8;
 
       offs ^= 1;
@@ -1307,13 +1245,13 @@ void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8
 #ifdef CPU_ARCH_SIMD
   for (; x < aligned_width; x += pixels_per_vec)
   {
-    WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
+    ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
     page_ptr += pixels_per_vec;
   }
 #endif
 
   for (; x < width; x++)
-    WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
+    ConvertVRAMPixel<format>(dest_ptr, *(page_ptr++));
 
   page += VRAM_WIDTH;
   dest += dest_stride;
@@ -1359,6 +1297,24 @@ void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, co
       DefaultCaseIsUnreachable()
     }
   }
+  else if (dest_format == GPUTexture::Format::A1BGR5)
+  {
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::A1BGR5>(page_ptr, width, height, dest, dest_stride);
+        break;
+
+        DefaultCaseIsUnreachable()
+    }
+  }
   else
   {
     Panic("Unsupported texture format.");
@@ -41,10 +41,8 @@ bool GPU_SW::Initialize(Error* error)
   if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread))
     return false;
 
-  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1,
-                                                         GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
-  static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
-                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1};
+  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB5A1, GPUTexture::Format::A1BGR5,
+                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGBA8};
   for (const GPUTexture::Format format : formats_for_16bit)
   {
     if (g_gpu_device->SupportsTextureFormat(format))
@@ -53,15 +51,10 @@ bool GPU_SW::Initialize(Error* error)
       break;
     }
   }
-  for (const GPUTexture::Format format : formats_for_24bit)
-  {
-    if (g_gpu_device->SupportsTextureFormat(format))
-    {
-      m_24bit_display_format = format;
-      break;
-    }
-  }
 
+  // RGBA8 will always be supported, hence we'll find one.
+  INFO_LOG("Using {} format for 16-bit display", GPUTexture::GetFormatName(m_16bit_display_format));
+  Assert(m_16bit_display_format != GPUTexture::Format::Unknown);
   return true;
 }
 
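
Editor's note: the reordered probe list tries the two 5551 layouts first since they keep conversion a pure 16-bit shuffle, then RGB565 (still 16-bit, but the mask bit is dropped), then RGBA8 as the guaranteed fallback; since m_16bit_display_format now starts as Unknown, the Assert would only fire if a backend reported support for none of the four.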
@@ -108,129 +101,43 @@ GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format
   return m_upload_texture.get();
 }
 
-template<GPUTexture::Format out_format, typename out_type>
-static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
-
-template<GPUTexture::Format out_format, typename out_type>
-static out_type VRAM16ToOutput(u16 value);
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(u16 value)
-{
-  return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
-}
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(u16 value)
-{
-  return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  const u32 a = ((value >> 15) != 0) ? 255 : 0;
-  return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB5A1, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = value & GSVector4i::cxpr16(0x3E0);
-    GSVector4i b = value.srl16<10>() & single_mask;
-    GSVector4i c = (value & single_mask).sll16<10>();
-    value = (a | b) | c;
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
-    GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
-    GSVector4i c = (value.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
-    GSVector4i d = (value & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
-    value = (((a | b) | c) | d);
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(*(src_ptr++));
-}
-
 template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = width * sizeof(OutputPixelType);
+  u32 dst_stride = Common::AlignUpPow2(width * texture->GetPixelSize(), 4);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
   // Fast path when not wrapping around.
   if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
   {
+    [[maybe_unused]] constexpr u32 pixels_per_vec = 8;
+    [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
+
     const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x];
     const u32 src_step = VRAM_WIDTH << line_skip;
 
     for (u32 row = 0; row < height; row++)
     {
-      CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
+      const u16* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = dst_ptr;
+      u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_width; x += pixels_per_vec)
+      {
+        ConvertVRAMPixels<display_format>(dst_row_ptr, GSVector4i::load<false>(src_row_ptr));
+        src_row_ptr += pixels_per_vec;
+      }
+#endif
+
+      for (; x < width; x++)
+        ConvertVRAMPixel<display_format>(dst_row_ptr, *(src_row_ptr++));
+
       src_ptr += src_step;
       dst_ptr += dst_stride;
     }
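
Why pixels_per_vec is 8: GSVector4i is one 128-bit register, i.e. eight u16 VRAM texels per iteration. For a 368-pixel-wide display row, AlignDownPow2(368, 8) == 368, so the loop runs 46 vector iterations with no scalar tail; a 350-pixel row would run 43 vector iterations plus 6 scalar pixels.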
@@ -242,10 +149,10 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u8* dst_row_ptr = dst_ptr;
 
       for (u32 col = src_x; col < end_x; col++)
-        *(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
+        ConvertVRAMPixel<display_format>(dst_row_ptr, src_row_ptr[col % VRAM_WIDTH]);
 
       src_y += y_step;
       dst_ptr += dst_stride;
@@ -260,18 +167,13 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
   return true;
 }
 
-template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
-  GPUTexture* texture = GetDisplayTexture(width, height, display_format);
+  GPUTexture* texture = GetDisplayTexture(width, height, FORMAT_FOR_24BIT);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
+  u32 dst_stride = width * sizeof(u32);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
@@ -280,8 +182,6 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     const u8* src_ptr = reinterpret_cast<const u8*>(&g_vram[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
     const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16);
     for (u32 row = 0; row < height; row++)
     {
-      if constexpr (display_format == GPUTexture::Format::RGBA8)
-      {
       const u8* src_row_ptr = src_ptr;
       u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
@@ -292,42 +192,6 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         *(dst_row_ptr++) = *(src_row_ptr++);
         *(dst_row_ptr++) = 0xFF;
       }
-      }
-      else if constexpr (display_format == GPUTexture::Format::BGRA8)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = src_row_ptr[2];
-          *(dst_row_ptr++) = src_row_ptr[1];
-          *(dst_row_ptr++) = src_row_ptr[0];
-          *(dst_row_ptr++) = 0xFF;
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB565)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
 
       src_ptr += src_stride;
       dst_ptr += dst_stride;
@@ -340,7 +204,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u32* dst_row_ptr = reinterpret_cast<u32*>(dst_ptr);
 
       for (u32 col = 0; col < width; col++)
       {
@@ -350,23 +214,8 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         const u8 shift = static_cast<u8>(col & 1u) * 8;
         const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
 
-        if constexpr (display_format == GPUTexture::Format::RGBA8)
-        {
         *(dst_row_ptr++) = rgb | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::BGRA8)
-        {
-          *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB565)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0xF800);
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x7C00);
-        }
       }
 
       src_y += y_step;
       dst_ptr += dst_stride;
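
A note on the byte shuffle above: 24-bit pixels are byte-packed into 16-bit VRAM words, so pixel N starts at byte offset N * 3. For even columns the R, G, B bytes sit in the low 24 bits of (s1 << 16) | s0; for odd columns they start one byte higher, hence shift = (col & 1) * 8 before the repack.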
@@ -392,6 +241,9 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
     case GPUTexture::Format::RGB5A1:
       return CopyOut15Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, width, height, line_skip);
 
+    case GPUTexture::Format::A1BGR5:
+      return CopyOut15Bit<GPUTexture::Format::A1BGR5>(src_x, src_y, width, height, line_skip);
+
     case GPUTexture::Format::RGB565:
      return CopyOut15Bit<GPUTexture::Format::RGB565>(src_x, src_y, width, height, line_skip);
 
@@ -407,23 +259,7 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
   }
   else
   {
-    switch (m_24bit_display_format)
-    {
-      case GPUTexture::Format::RGB5A1:
-        return CopyOut24Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGB565:
-        return CopyOut24Bit<GPUTexture::Format::RGB565>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGBA8:
-        return CopyOut24Bit<GPUTexture::Format::RGBA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::BGRA8:
-        return CopyOut24Bit<GPUTexture::Format::BGRA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      default:
-        UnreachableCode();
-    }
+    return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip);
   }
 }
 
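
Editor's note: CopyOut24Bit is no longer templated on the display format (see the header change below), and 24-bit output always goes through FORMAT_FOR_24BIT (RGBA8, which every backend supports), so the dispatch switch collapses to a single call.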
@@ -45,7 +45,6 @@ protected:
   template<GPUTexture::Format display_format>
   bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip);
 
-  template<GPUTexture::Format display_format>
   bool CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip);
 
   bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit);
@@ -57,11 +56,13 @@ protected:
   void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
   void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
 
 private:
+  static constexpr GPUTexture::Format FORMAT_FOR_24BIT = GPUTexture::Format::RGBA8; // RGBA8 always supported.
+
   GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format);
 
   FixedHeapArray<u8, GPU_MAX_DISPLAY_WIDTH * GPU_MAX_DISPLAY_HEIGHT * sizeof(u32)> m_upload_buffer;
-  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565;
-  GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8;
+  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::Unknown;
   std::unique_ptr<GPUTexture> m_upload_texture;
 
   GPU_SW_Backend m_backend;
@@ -3,8 +3,10 @@
 
 #ifdef __INTELLISENSE__
 
-#include "common/gsvector.h"
+#include "gpu.h"
+
+#include "common/gsvector.h"
 
 #include <algorithm>
 
 #define USE_VECTOR 1
@@ -5,6 +5,8 @@
 
 #include "types.h"
 
+#include "util/gpu_texture.h"
+
 #include "common/bitfield.h"
 #include "common/bitutils.h"
 #include "common/gsvector.h"
@@ -249,6 +251,101 @@ ALWAYS_INLINE static constexpr u16 VRAMRGBA8888ToRGBA5551(u32 color)
   return Truncate16(r | (g << 5) | (b << 10) | (a << 15));
 }
 
+#ifdef CPU_ARCH_SIMD
+
+ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
+{
+  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
+}
+
+ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
+{
+  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
+
+  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
+  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
+  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
+  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
+
+  return r | g.sll32<8>() | b.sll32<16>() | a;
+}
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixels(u8*& dest, GSVector4i c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
+    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
+
+    GSVector4i::store<false>(dest, low);
+    dest += sizeof(GSVector4i);
+
+    GSVector4i::store<false>(dest, high);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
+
+    const GSVector4i repacked =
+      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const GSVector4i repacked = (c16 & GSVector4i::cxpr16(static_cast<s16>(0x3E0))).sll16<1>() |
+                                (c16.srl16<9>() & GSVector4i::cxpr16(0x3E)) |
+                                (c16 & GSVector4i::cxpr16(0x1F)).sll16<11>() | c16.srl16<15>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
+    const GSVector4i a = (c16 & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
+    const GSVector4i b = (c16 & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
+    const GSVector4i c = (c16.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
+    const GSVector4i d = (c16 & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
+    GSVector4i::store<false>(dest, (((a | b) | c) | d));
+    dest += sizeof(GSVector4i);
+  }
+}
+
+#endif
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixel(u8*& dest, u16 c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
+    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
+    dest += sizeof(c32);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) | ((c16 & 0x1F) << 11) | (c16 >> 15);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 & 0x20) << 1) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 11);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+}
+
 union GPUVertexPosition
 {
   u32 bits;
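
A minimal standalone check (assumed test harness, not part of the commit) that the scalar A1BGR5 repack added above is a bijection on u16: every source bit maps to a distinct destination bit, so no two VRAM texels collide after conversion.

#include <cstdint>
#include <cstdio>

int main()
{
  static bool seen[65536] = {};
  for (uint32_t v = 0; v <= 0xFFFF; v++)
  {
    const uint16_t c16 = static_cast<uint16_t>(v);
    const uint16_t out = static_cast<uint16_t>(((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) |
                                               ((c16 & 0x1F) << 11) | (c16 >> 15));
    if (seen[out])
    {
      std::printf("collision at %04X\n", v);
      return 1;
    }
    seen[out] = true;
  }
  std::printf("A1BGR5 repack is a bijection on u16\n");
  return 0;
}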