From 5725a0360bd4235f003db56c488ab2bf1540af78 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 6 Dec 2024 18:21:56 +1000 Subject: [PATCH] GPU: Use A1BGR5 format for SW/HashCache if available --- src/core/gpu_hw_texture_cache.cpp | 108 ++++--------- src/core/gpu_sw.cpp | 248 +++++------------------------- src/core/gpu_sw.h | 7 +- src/core/gpu_sw_rasterizer.inl | 4 +- src/core/gpu_types.h | 97 ++++++++++++ 5 files changed, 178 insertions(+), 286 deletions(-) diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp index 44de7df8e..a9b76131a 100644 --- a/src/core/gpu_hw_texture_cache.cpp +++ b/src/core/gpu_hw_texture_cache.cpp @@ -781,6 +781,8 @@ void GPUTextureCache::SetHashCacheTextureFormat() // Prefer 16-bit texture formats where possible. if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1)) s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1; + else if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::A1BGR5)) + s_state.hash_cache_texture_format = GPUTexture::Format::A1BGR5; else s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8; @@ -1080,70 +1082,6 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()]; } -template -ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16) -{ - if constexpr (format == GPUTexture::Format::RGBA8) - { - const u32 c32 = VRAMRGBA5551ToRGBA8888(c16); - std::memcpy(std::assume_aligned(dest), &c32, sizeof(c32)); - dest += sizeof(c32); - } - else if constexpr (format == GPUTexture::Format::RGB5A1) - { - const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10); - std::memcpy(std::assume_aligned(dest), &repacked, sizeof(repacked)); - dest += sizeof(repacked); - } -} - -#ifdef CPU_ARCH_SIMD - -ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val) -{ - return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>(); -} - -ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val) -{ - static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F); - - const GSVector4i r = VRAM5BitTo8Bit(val & cmask); - const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask)); - const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask)); - const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>(); - - return r | g.sll32<8>() | b.sll32<16>() | b.sll32<24>() | a; -} - -template -ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16) -{ - if constexpr (format == GPUTexture::Format::RGBA8) - { - const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16()); - const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16()); - - GSVector4i::store(dest, low); - dest += sizeof(GSVector4i); - - GSVector4i::store(dest, high); - dest += sizeof(GSVector4i); - } - else if constexpr (format == GPUTexture::Format::RGB5A1) - { - static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F); - - const GSVector4i repacked = - (c16 & GSVector4i::cxpr16(static_cast(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>(); - - GSVector4i::store(dest, repacked); - dest += sizeof(GSVector4i); - } -} - -#endif - template void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest, u32 dest_stride) @@ -1175,17 +1113,17 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi c16[5] = palette[(pp >> 4) & 0x0F]; c16[6] = palette[(pp >> 8) & 0x0F]; c16[7] = palette[pp >> 12]; - WriteDecodedTexels(dest_ptr, GSVector4i::load(c16)); + ConvertVRAMPixels(dest_ptr, GSVector4i::load(c16)); } #endif for (; x < vram_width; x++) { const u32 pp = *(page_ptr++); - WriteDecodedTexel(dest_ptr, palette[pp & 0x0F]); - WriteDecodedTexel(dest_ptr, palette[(pp >> 4) & 0x0F]); - WriteDecodedTexel(dest_ptr, palette[(pp >> 8) & 0x0F]); - WriteDecodedTexel(dest_ptr, palette[pp >> 12]); + ConvertVRAMPixel(dest_ptr, palette[pp & 0x0F]); + ConvertVRAMPixel(dest_ptr, palette[(pp >> 4) & 0x0F]); + ConvertVRAMPixel(dest_ptr, palette[(pp >> 8) & 0x0F]); + ConvertVRAMPixel(dest_ptr, palette[pp >> 12]); } page += VRAM_WIDTH; @@ -1206,7 +1144,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi if (offs == 0) texel = *(page_ptr++); - WriteDecodedTexel(dest_ptr, palette[texel & 0x0F]); + ConvertVRAMPixel(dest_ptr, palette[texel & 0x0F]); texel >>= 4; offs = (offs + 1) % 4; @@ -1251,15 +1189,15 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi pp = *(page_ptr++); c16[6] = palette[pp & 0xFF]; c16[7] = palette[(pp >> 8) & 0xFF]; - WriteDecodedTexels(dest_ptr, GSVector4i::load(c16)); + ConvertVRAMPixels(dest_ptr, GSVector4i::load(c16)); } #endif for (; x < vram_width; x++) { const u32 pp = *(page_ptr++); - WriteDecodedTexel(dest_ptr, palette[pp & 0xFF]); - WriteDecodedTexel(dest_ptr, palette[pp >> 8]); + ConvertVRAMPixel(dest_ptr, palette[pp & 0xFF]); + ConvertVRAMPixel(dest_ptr, palette[pp >> 8]); } page += VRAM_WIDTH; @@ -1280,7 +1218,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi if (offs == 0) texel = *(page_ptr++); - WriteDecodedTexel(dest_ptr, palette[texel & 0xFF]); + ConvertVRAMPixel(dest_ptr, palette[texel & 0xFF]); texel >>= 8; offs ^= 1; @@ -1307,13 +1245,13 @@ void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8 #ifdef CPU_ARCH_SIMD for (; x < aligned_width; x += pixels_per_vec) { - WriteDecodedTexels(dest_ptr, GSVector4i::load(page_ptr)); + ConvertVRAMPixels(dest_ptr, GSVector4i::load(page_ptr)); page_ptr += pixels_per_vec; } #endif for (; x < width; x++) - WriteDecodedTexel(dest_ptr, *(page_ptr++)); + ConvertVRAMPixel(dest_ptr, *(page_ptr++)); page += VRAM_WIDTH; dest += dest_stride; @@ -1359,6 +1297,24 @@ void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, co DefaultCaseIsUnreachable() } } + else if (dest_format == GPUTexture::Format::A1BGR5) + { + switch (mode) + { + case GPUTextureMode::Palette4Bit: + DecodeTexture4(page_ptr, palette, width, height, dest, dest_stride); + break; + case GPUTextureMode::Palette8Bit: + DecodeTexture8(page_ptr, palette, width, height, dest, dest_stride); + break; + case GPUTextureMode::Direct16Bit: + case GPUTextureMode::Reserved_Direct16Bit: + DecodeTexture16(page_ptr, width, height, dest, dest_stride); + break; + + DefaultCaseIsUnreachable() + } + } else { Panic("Unsupported texture format."); diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index 9159dbddc..3020246d6 100644 --- a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -41,10 +41,8 @@ bool GPU_SW::Initialize(Error* error) if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread)) return false; - static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1, - GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8}; - static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8, - GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1}; + static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB5A1, GPUTexture::Format::A1BGR5, + GPUTexture::Format::RGB565, GPUTexture::Format::RGBA8}; for (const GPUTexture::Format format : formats_for_16bit) { if (g_gpu_device->SupportsTextureFormat(format)) @@ -53,15 +51,10 @@ bool GPU_SW::Initialize(Error* error) break; } } - for (const GPUTexture::Format format : formats_for_24bit) - { - if (g_gpu_device->SupportsTextureFormat(format)) - { - m_24bit_display_format = format; - break; - } - } + // RGBA8 will always be supported, hence we'll find one. + INFO_LOG("Using {} format for 16-bit display", GPUTexture::GetFormatName(m_16bit_display_format)); + Assert(m_16bit_display_format != GPUTexture::Format::Unknown); return true; } @@ -108,129 +101,43 @@ GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format return m_upload_texture.get(); } -template -static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width); - -template -static out_type VRAM16ToOutput(u16 value); - -template<> -ALWAYS_INLINE u16 VRAM16ToOutput(u16 value) -{ - return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10); -} - -template<> -ALWAYS_INLINE u16 VRAM16ToOutput(u16 value) -{ - return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11); -} - -template<> -ALWAYS_INLINE u32 VRAM16ToOutput(u16 value) -{ - const u32 value32 = ZeroExtend32(value); - const u32 r = (value32 & 31u) << 3; - const u32 g = ((value32 >> 5) & 31u) << 3; - const u32 b = ((value32 >> 10) & 31u) << 3; - const u32 a = ((value >> 15) != 0) ? 255 : 0; - return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24); -} - -template<> -ALWAYS_INLINE u32 VRAM16ToOutput(u16 value) -{ - const u32 value32 = ZeroExtend32(value); - const u32 r = (value32 & 31u) << 3; - const u32 g = ((value32 >> 5) & 31u) << 3; - const u32 b = ((value32 >> 10) & 31u) << 3; - return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u); -} - -template<> -ALWAYS_INLINE void CopyOutRow16(const u16* src_ptr, u16* dst_ptr, u32 width) -{ - u32 col = 0; - - const u32 aligned_width = Common::AlignDownPow2(width, 8); - for (; col < aligned_width; col += 8) - { - constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F); - GSVector4i value = GSVector4i::load(src_ptr); - src_ptr += 8; - GSVector4i a = value & GSVector4i::cxpr16(0x3E0); - GSVector4i b = value.srl16<10>() & single_mask; - GSVector4i c = (value & single_mask).sll16<10>(); - value = (a | b) | c; - GSVector4i::store(dst_ptr, value); - dst_ptr += 8; - } - - for (; col < width; col++) - *(dst_ptr++) = VRAM16ToOutput(*(src_ptr++)); -} - -template<> -ALWAYS_INLINE void CopyOutRow16(const u16* src_ptr, u16* dst_ptr, u32 width) -{ - u32 col = 0; - - const u32 aligned_width = Common::AlignDownPow2(width, 8); - for (; col < aligned_width; col += 8) - { - constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F); - GSVector4i value = GSVector4i::load(src_ptr); - src_ptr += 8; - GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1 - GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>(); // (value & 0x20) << 1 - GSVector4i c = (value.srl16<10>() & single_mask); // ((value >> 10) & 0x1F) - GSVector4i d = (value & single_mask).sll16<11>(); // ((value & 0x1F) << 11) - value = (((a | b) | c) | d); - GSVector4i::store(dst_ptr, value); - dst_ptr += 8; - } - - for (; col < width; col++) - *(dst_ptr++) = VRAM16ToOutput(*(src_ptr++)); -} - -template<> -ALWAYS_INLINE void CopyOutRow16(const u16* src_ptr, u32* dst_ptr, u32 width) -{ - for (u32 col = 0; col < width; col++) - *(dst_ptr++) = VRAM16ToOutput(*(src_ptr++)); -} - -template<> -ALWAYS_INLINE void CopyOutRow16(const u16* src_ptr, u32* dst_ptr, u32 width) -{ - for (u32 col = 0; col < width; col++) - *(dst_ptr++) = VRAM16ToOutput(*(src_ptr++)); -} - template ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip) { - using OutputPixelType = - std::conditional_t; - GPUTexture* texture = GetDisplayTexture(width, height, display_format); if (!texture) [[unlikely]] return false; - u32 dst_stride = width * sizeof(OutputPixelType); + u32 dst_stride = Common::AlignUpPow2(width * texture->GetPixelSize(), 4); u8* dst_ptr = m_upload_buffer.data(); const bool mapped = texture->Map(reinterpret_cast(&dst_ptr), &dst_stride, 0, 0, width, height); // Fast path when not wrapping around. if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT) { + [[maybe_unused]] constexpr u32 pixels_per_vec = 8; + [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec); + const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x]; const u32 src_step = VRAM_WIDTH << line_skip; + for (u32 row = 0; row < height; row++) { - CopyOutRow16(src_ptr, reinterpret_cast(dst_ptr), width); + const u16* src_row_ptr = src_ptr; + u8* dst_row_ptr = dst_ptr; + u32 x = 0; + +#ifdef CPU_ARCH_SIMD + for (; x < aligned_width; x += pixels_per_vec) + { + ConvertVRAMPixels(dst_row_ptr, GSVector4i::load(src_row_ptr)); + src_row_ptr += pixels_per_vec; + } +#endif + + for (; x < width; x++) + ConvertVRAMPixel(dst_row_ptr, *(src_row_ptr++)); + src_ptr += src_step; dst_ptr += dst_stride; } @@ -242,10 +149,10 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, for (u32 row = 0; row < height; row++) { const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH]; - OutputPixelType* dst_row_ptr = reinterpret_cast(dst_ptr); + u8* dst_row_ptr = dst_ptr; for (u32 col = src_x; col < end_x; col++) - *(dst_row_ptr++) = VRAM16ToOutput(src_row_ptr[col % VRAM_WIDTH]); + ConvertVRAMPixel(dst_row_ptr, src_row_ptr[col % VRAM_WIDTH]); src_y += y_step; dst_ptr += dst_stride; @@ -260,18 +167,13 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, return true; } -template ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip) { - using OutputPixelType = - std::conditional_t; - - GPUTexture* texture = GetDisplayTexture(width, height, display_format); + GPUTexture* texture = GetDisplayTexture(width, height, FORMAT_FOR_24BIT); if (!texture) [[unlikely]] return false; - u32 dst_stride = Common::AlignUpPow2(width * sizeof(OutputPixelType), 4); + u32 dst_stride = width * sizeof(u32); u8* dst_ptr = m_upload_buffer.data(); const bool mapped = texture->Map(reinterpret_cast(&dst_ptr), &dst_stride, 0, 0, width, height); @@ -281,52 +183,14 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16); for (u32 row = 0; row < height; row++) { - if constexpr (display_format == GPUTexture::Format::RGBA8) + const u8* src_row_ptr = src_ptr; + u8* dst_row_ptr = reinterpret_cast(dst_ptr); + for (u32 col = 0; col < width; col++) { - const u8* src_row_ptr = src_ptr; - u8* dst_row_ptr = reinterpret_cast(dst_ptr); - for (u32 col = 0; col < width; col++) - { - *(dst_row_ptr++) = *(src_row_ptr++); - *(dst_row_ptr++) = *(src_row_ptr++); - *(dst_row_ptr++) = *(src_row_ptr++); - *(dst_row_ptr++) = 0xFF; - } - } - else if constexpr (display_format == GPUTexture::Format::BGRA8) - { - const u8* src_row_ptr = src_ptr; - u8* dst_row_ptr = reinterpret_cast(dst_ptr); - for (u32 col = 0; col < width; col++) - { - *(dst_row_ptr++) = src_row_ptr[2]; - *(dst_row_ptr++) = src_row_ptr[1]; - *(dst_row_ptr++) = src_row_ptr[0]; - *(dst_row_ptr++) = 0xFF; - src_row_ptr += 3; - } - } - else if constexpr (display_format == GPUTexture::Format::RGB565) - { - const u8* src_row_ptr = src_ptr; - u16* dst_row_ptr = reinterpret_cast(dst_ptr); - for (u32 col = 0; col < width; col++) - { - *(dst_row_ptr++) = ((static_cast(src_row_ptr[0]) >> 3) << 11) | - ((static_cast(src_row_ptr[1]) >> 2) << 5) | (static_cast(src_row_ptr[2]) >> 3); - src_row_ptr += 3; - } - } - else if constexpr (display_format == GPUTexture::Format::RGB5A1) - { - const u8* src_row_ptr = src_ptr; - u16* dst_row_ptr = reinterpret_cast(dst_ptr); - for (u32 col = 0; col < width; col++) - { - *(dst_row_ptr++) = ((static_cast(src_row_ptr[0]) >> 3) << 10) | - ((static_cast(src_row_ptr[1]) >> 3) << 5) | (static_cast(src_row_ptr[2]) >> 3); - src_row_ptr += 3; - } + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = 0xFF; } src_ptr += src_stride; @@ -340,7 +204,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x for (u32 row = 0; row < height; row++) { const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH]; - OutputPixelType* dst_row_ptr = reinterpret_cast(dst_ptr); + u32* dst_row_ptr = reinterpret_cast(dst_ptr); for (u32 col = 0; col < width; col++) { @@ -350,22 +214,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x const u8 shift = static_cast(col & 1u) * 8; const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift); - if constexpr (display_format == GPUTexture::Format::RGBA8) - { - *(dst_row_ptr++) = rgb | 0xFF000000u; - } - else if constexpr (display_format == GPUTexture::Format::BGRA8) - { - *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u; - } - else if constexpr (display_format == GPUTexture::Format::RGB565) - { - *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0x3E0000); - } - else if constexpr (display_format == GPUTexture::Format::RGB5A1) - { - *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x1F0000); - } + *(dst_row_ptr++) = rgb | 0xFF000000u; } src_y += y_step; @@ -392,6 +241,9 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3 case GPUTexture::Format::RGB5A1: return CopyOut15Bit(src_x, src_y, width, height, line_skip); + case GPUTexture::Format::A1BGR5: + return CopyOut15Bit(src_x, src_y, width, height, line_skip); + case GPUTexture::Format::RGB565: return CopyOut15Bit(src_x, src_y, width, height, line_skip); @@ -407,23 +259,7 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3 } else { - switch (m_24bit_display_format) - { - case GPUTexture::Format::RGB5A1: - return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip); - - case GPUTexture::Format::RGB565: - return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip); - - case GPUTexture::Format::RGBA8: - return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip); - - case GPUTexture::Format::BGRA8: - return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip); - - default: - UnreachableCode(); - } + return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip); } } diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h index 2251843aa..b99bdd168 100644 --- a/src/core/gpu_sw.h +++ b/src/core/gpu_sw.h @@ -45,7 +45,6 @@ protected: template bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip); - template bool CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip); bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit); @@ -57,11 +56,13 @@ protected: void FillBackendCommandParameters(GPUBackendCommand* cmd) const; void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const; +private: + static constexpr GPUTexture::Format FORMAT_FOR_24BIT = GPUTexture::Format::RGBA8; // RGBA8 always supported. + GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format); FixedHeapArray m_upload_buffer; - GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565; - GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8; + GPUTexture::Format m_16bit_display_format = GPUTexture::Format::Unknown; std::unique_ptr m_upload_texture; GPU_SW_Backend m_backend; diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl index 803dbfdef..d470a93a6 100644 --- a/src/core/gpu_sw_rasterizer.inl +++ b/src/core/gpu_sw_rasterizer.inl @@ -3,8 +3,10 @@ #ifdef __INTELLISENSE__ -#include "common/gsvector.h" #include "gpu.h" + +#include "common/gsvector.h" + #include #define USE_VECTOR 1 diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h index 137264ec5..4782272b0 100644 --- a/src/core/gpu_types.h +++ b/src/core/gpu_types.h @@ -5,6 +5,8 @@ #include "types.h" +#include "util/gpu_texture.h" + #include "common/bitfield.h" #include "common/bitutils.h" #include "common/gsvector.h" @@ -249,6 +251,101 @@ ALWAYS_INLINE static constexpr u16 VRAMRGBA8888ToRGBA5551(u32 color) return Truncate16(r | (g << 5) | (b << 10) | (a << 15)); } +#ifdef CPU_ARCH_SIMD + +ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val) +{ + return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>(); +} + +ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val) +{ + static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F); + + const GSVector4i r = VRAM5BitTo8Bit(val & cmask); + const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask)); + const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask)); + const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>(); + + return r | g.sll32<8>() | b.sll32<16>() | a; +} + +template +ALWAYS_INLINE static void ConvertVRAMPixels(u8*& dest, GSVector4i c16) +{ + if constexpr (format == GPUTexture::Format::RGBA8) + { + const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16()); + const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16()); + + GSVector4i::store(dest, low); + dest += sizeof(GSVector4i); + + GSVector4i::store(dest, high); + dest += sizeof(GSVector4i); + } + else if constexpr (format == GPUTexture::Format::RGB5A1) + { + static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F); + + const GSVector4i repacked = + (c16 & GSVector4i::cxpr16(static_cast(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>(); + + GSVector4i::store(dest, repacked); + dest += sizeof(GSVector4i); + } + else if constexpr (format == GPUTexture::Format::A1BGR5) + { + const GSVector4i repacked = (c16 & GSVector4i::cxpr16(static_cast(0x3E0))).sll16<1>() | + (c16.srl16<9>() & GSVector4i::cxpr16(0x3E)) | + (c16 & GSVector4i::cxpr16(0x1F)).sll16<11>() | c16.srl16<15>(); + + GSVector4i::store(dest, repacked); + dest += sizeof(GSVector4i); + } + else if constexpr (format == GPUTexture::Format::RGB565) + { + constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F); + const GSVector4i a = (c16 & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1 + const GSVector4i b = (c16 & GSVector4i::cxpr16(0x20)).sll16<1>(); // (value & 0x20) << 1 + const GSVector4i c = (c16.srl16<10>() & single_mask); // ((value >> 10) & 0x1F) + const GSVector4i d = (c16 & single_mask).sll16<11>(); // ((value & 0x1F) << 11) + GSVector4i::store(dest, (((a | b) | c) | d)); + dest += sizeof(GSVector4i); + } +} + +#endif + +template +ALWAYS_INLINE static void ConvertVRAMPixel(u8*& dest, u16 c16) +{ + if constexpr (format == GPUTexture::Format::RGBA8) + { + const u32 c32 = VRAMRGBA5551ToRGBA8888(c16); + std::memcpy(std::assume_aligned(dest), &c32, sizeof(c32)); + dest += sizeof(c32); + } + else if constexpr (format == GPUTexture::Format::RGB5A1) + { + const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10); + std::memcpy(std::assume_aligned(dest), &repacked, sizeof(repacked)); + dest += sizeof(repacked); + } + else if constexpr (format == GPUTexture::Format::A1BGR5) + { + const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) | ((c16 & 0x1F) << 11) | (c16 >> 15); + std::memcpy(std::assume_aligned(dest), &repacked, sizeof(repacked)); + dest += sizeof(repacked); + } + else if constexpr (format == GPUTexture::Format::RGB565) + { + const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 & 0x20) << 1) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 11); + std::memcpy(std::assume_aligned(dest), &repacked, sizeof(repacked)); + dest += sizeof(repacked); + } +} + union GPUVertexPosition { u32 bits;