GPU: Use A1BGR5 format for SW/HashCache if available

Stenzek 2024-12-06 18:21:56 +10:00
parent 8c5fadafba
commit 5725a0360b
5 changed files with 178 additions and 286 deletions

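For reference, a PS1 VRAM pixel is a 16-bit RGBA5551 word: R in bits 0-4, G in bits 5-9, B in bits 10-14, and the mask/alpha bit at 15. The repack expressions this commit introduces can be exercised standalone; below is a minimal sketch using the same bit arithmetic as the diff, with illustrative helper names (vram_to_rgb5a1, vram_to_a1bgr5 are not names from the codebase):

#include <cstdint>
#include <cstdio>

// VRAM layout: A(15) BBBBB(14..10) GGGGG(9..5) RRRRR(4..0).

// RGB5A1 (preferred for the hash cache): swap the R and B fields,
// keep G and the alpha bit where they are.
static uint16_t vram_to_rgb5a1(uint16_t c)
{
  return static_cast<uint16_t>((c & 0x83E0) | ((c >> 10) & 0x1F) | ((c & 0x1F) << 10));
}

// A1BGR5 (the new fallback): R to bits 11..15, G to bits 6..10,
// B to bits 1..5, and the alpha bit moves down to bit 0.
static uint16_t vram_to_a1bgr5(uint16_t c)
{
  return static_cast<uint16_t>(((c & 0x3E0) << 1) | ((c >> 9) & 0x3E) | ((c & 0x1F) << 11) | (c >> 15));
}

int main()
{
  const uint16_t red = 0x801F; // mask bit set, R = 31, G = B = 0
  std::printf("%04X %04X\n", vram_to_rgb5a1(red), vram_to_a1bgr5(red)); // prints FC00 F801
  return 0;
}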
View File

@@ -781,6 +781,8 @@ void GPUTextureCache::SetHashCacheTextureFormat()
   // Prefer 16-bit texture formats where possible.
   if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
     s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
+  else if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::A1BGR5))
+    s_state.hash_cache_texture_format = GPUTexture::Format::A1BGR5;
   else
     s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
@@ -1080,70 +1082,6 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
   return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
 }
 
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
-    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
-    dest += sizeof(c32);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
-    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
-    dest += sizeof(repacked);
-  }
-}
-
-#ifdef CPU_ARCH_SIMD
-ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
-{
-  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
-}
-
-ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
-{
-  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
-
-  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
-  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
-  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
-  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
return r | g.sll32<8>() | b.sll32<16>() | b.sll32<24>() | a;
-}
-
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
-    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
-    GSVector4i::store<false>(dest, low);
-    dest += sizeof(GSVector4i);
-    GSVector4i::store<false>(dest, high);
-    dest += sizeof(GSVector4i);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
-    const GSVector4i repacked =
-      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
-    GSVector4i::store<false>(dest, repacked);
-    dest += sizeof(GSVector4i);
-  }
-}
-#endif
-
 template<GPUTexture::Format format>
 void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
@@ -1175,17 +1113,17 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       c16[5] = palette[(pp >> 4) & 0x0F];
       c16[6] = palette[(pp >> 8) & 0x0F];
       c16[7] = palette[pp >> 12];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 12]);
     }
 
     page += VRAM_WIDTH;
@@ -1206,7 +1144,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0x0F]);
       texel >>= 4;
 
       offs = (offs + 1) % 4;
@@ -1251,15 +1189,15 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       pp = *(page_ptr++);
       c16[6] = palette[pp & 0xFF];
       c16[7] = palette[(pp >> 8) & 0xFF];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 8]);
     }
 
     page += VRAM_WIDTH;
@@ -1280,7 +1218,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0xFF]);
       texel >>= 8;
 
      offs ^= 1;
@@ -1307,13 +1245,13 @@ void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8
 #ifdef CPU_ARCH_SIMD
     for (; x < aligned_width; x += pixels_per_vec)
     {
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
       page_ptr += pixels_per_vec;
     }
 #endif
 
     for (; x < width; x++)
-      WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
+      ConvertVRAMPixel<format>(dest_ptr, *(page_ptr++));
 
     page += VRAM_WIDTH;
     dest += dest_stride;
@@ -1359,6 +1297,24 @@ void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, co
         DefaultCaseIsUnreachable()
     }
   }
+  else if (dest_format == GPUTexture::Format::A1BGR5)
+  {
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::A1BGR5>(page_ptr, width, height, dest, dest_stride);
+        break;
+
+        DefaultCaseIsUnreachable()
+    }
+  }
   else
   {
     Panic("Unsupported texture format.");

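As context for the DecodeTexture4 hunks above: in 4-bit palette mode, one 16-bit VRAM word packs four CLUT indices, lowest nibble first, which is why the scalar tail emits four converted texels per word. A minimal standalone sketch of that unpacking (the helper name is illustrative, not from the codebase):

#include <cstdint>

// One VRAM word in 4-bit mode -> four palette indices -> four RGBA5551 colors.
static void decode_word_4bit(uint16_t word, const uint16_t palette[16], uint16_t out[4])
{
  out[0] = palette[word & 0x0F];
  out[1] = palette[(word >> 4) & 0x0F];
  out[2] = palette[(word >> 8) & 0x0F];
  out[3] = palette[word >> 12];
}

The SIMD path performs the same lookups for eight texels (two VRAM words), gathers them into one vector, and hands the whole batch to ConvertVRAMPixels in a single call.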
View File

@@ -41,10 +41,8 @@ bool GPU_SW::Initialize(Error* error)
   if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread))
     return false;
 
-  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1,
-                                                         GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
-  static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
-                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1};
+  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB5A1, GPUTexture::Format::A1BGR5,
+                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGBA8};
   for (const GPUTexture::Format format : formats_for_16bit)
   {
     if (g_gpu_device->SupportsTextureFormat(format))
@@ -53,15 +51,10 @@ bool GPU_SW::Initialize(Error* error)
       break;
     }
   }
 
-  for (const GPUTexture::Format format : formats_for_24bit)
-  {
-    if (g_gpu_device->SupportsTextureFormat(format))
-    {
-      m_24bit_display_format = format;
-      break;
-    }
-  }
+  // RGBA8 will always be supported, hence we'll find one.
+  INFO_LOG("Using {} format for 16-bit display", GPUTexture::GetFormatName(m_16bit_display_format));
+  Assert(m_16bit_display_format != GPUTexture::Format::Unknown);
 
   return true;
 }
@@ -108,129 +101,43 @@ GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format
   return m_upload_texture.get();
 }
 
-template<GPUTexture::Format out_format, typename out_type>
-static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
-
-template<GPUTexture::Format out_format, typename out_type>
-static out_type VRAM16ToOutput(u16 value);
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(u16 value)
-{
-  return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
-}
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(u16 value)
-{
-  return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  const u32 a = ((value >> 15) != 0) ? 255 : 0;
-  return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB5A1, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-
-    GSVector4i a = value & GSVector4i::cxpr16(0x3E0);
-    GSVector4i b = value.srl16<10>() & single_mask;
-    GSVector4i c = (value & single_mask).sll16<10>();
-    value = (a | b) | c;
-
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-
-    GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
-    GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
-    GSVector4i c = (value.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
-    GSVector4i d = (value & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
-    value = (((a | b) | c) | d);
-
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(*(src_ptr++));
-}
-
 template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = width * sizeof(OutputPixelType);
+  u32 dst_stride = Common::AlignUpPow2(width * texture->GetPixelSize(), 4);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
   // Fast path when not wrapping around.
   if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
   {
+    [[maybe_unused]] constexpr u32 pixels_per_vec = 8;
+    [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
+
     const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x];
     const u32 src_step = VRAM_WIDTH << line_skip;
     for (u32 row = 0; row < height; row++)
     {
-      CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
+      const u16* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = dst_ptr;
+
+      u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_width; x += pixels_per_vec)
+      {
+        ConvertVRAMPixels<display_format>(dst_row_ptr, GSVector4i::load<false>(src_row_ptr));
+        src_row_ptr += pixels_per_vec;
+      }
+#endif
+
+      for (; x < width; x++)
+        ConvertVRAMPixel<display_format>(dst_row_ptr, *(src_row_ptr++));
+
       src_ptr += src_step;
       dst_ptr += dst_stride;
     }
@@ -242,10 +149,10 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u8* dst_row_ptr = dst_ptr;
       for (u32 col = src_x; col < end_x; col++)
-        *(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
+        ConvertVRAMPixel<display_format>(dst_row_ptr, src_row_ptr[col % VRAM_WIDTH]);
 
       src_y += y_step;
       dst_ptr += dst_stride;
@@ -260,18 +167,13 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
   return true;
 }
 
-template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
-  GPUTexture* texture = GetDisplayTexture(width, height, display_format);
+  GPUTexture* texture = GetDisplayTexture(width, height, FORMAT_FOR_24BIT);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
+  u32 dst_stride = width * sizeof(u32);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
@@ -281,52 +183,14 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16);
     for (u32 row = 0; row < height; row++)
     {
-      if constexpr (display_format == GPUTexture::Format::RGBA8)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = *(src_row_ptr++);
-          *(dst_row_ptr++) = 0xFF;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::BGRA8)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = src_row_ptr[2];
-          *(dst_row_ptr++) = src_row_ptr[1];
-          *(dst_row_ptr++) = src_row_ptr[0];
-          *(dst_row_ptr++) = 0xFF;
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB565)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
+      const u8* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
+      for (u32 col = 0; col < width; col++)
+      {
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = 0xFF;
+      }
 
       src_ptr += src_stride;
@@ -340,7 +204,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u32* dst_row_ptr = reinterpret_cast<u32*>(dst_ptr);
       for (u32 col = 0; col < width; col++)
       {
@@ -350,22 +214,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         const u8 shift = static_cast<u8>(col & 1u) * 8;
         const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
 
-        if constexpr (display_format == GPUTexture::Format::RGBA8)
-        {
-          *(dst_row_ptr++) = rgb | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::BGRA8)
-        {
-          *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB565)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0x3E0000);
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x1F0000);
-        }
+        *(dst_row_ptr++) = rgb | 0xFF000000u;
       }
 
       src_y += y_step;
@@ -392,6 +241,9 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
       case GPUTexture::Format::RGB5A1:
         return CopyOut15Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, width, height, line_skip);
 
+      case GPUTexture::Format::A1BGR5:
+        return CopyOut15Bit<GPUTexture::Format::A1BGR5>(src_x, src_y, width, height, line_skip);
+
       case GPUTexture::Format::RGB565:
         return CopyOut15Bit<GPUTexture::Format::RGB565>(src_x, src_y, width, height, line_skip);
@@ -407,23 +259,7 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
   }
   else
   {
-    switch (m_24bit_display_format)
-    {
-      case GPUTexture::Format::RGB5A1:
-        return CopyOut24Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGB565:
-        return CopyOut24Bit<GPUTexture::Format::RGB565>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGBA8:
-        return CopyOut24Bit<GPUTexture::Format::RGBA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::BGRA8:
-        return CopyOut24Bit<GPUTexture::Format::BGRA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      default:
-        UnreachableCode();
-    }
+    return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip);
   }
 }

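One note on the 24-bit slow path above: in 24-bit mode, pixels are packed RGB byte triples stored across 16-bit VRAM words, so a pixel can straddle a word boundary. The wrap-around loop therefore fuses the two words covering the pixel into a 32-bit window and shifts by the pixel's byte parity. A minimal restatement of that extraction, with an illustrative helper name and s0/s1 standing for the two consecutive VRAM words:

#include <cstdint>

// Byte-align one packed RGB triple out of two 16-bit VRAM words.
// Even columns start on a word boundary (shift 0); odd columns start mid-word (shift 8).
static uint32_t extract_rgb24(uint16_t s0, uint16_t s1, uint32_t col)
{
  const uint32_t shift = (col & 1u) * 8;
  return ((static_cast<uint32_t>(s1) << 16) | s0) >> shift; // low byte = R, then G, then B
}

The fast path avoids this entirely by walking the row with a plain byte pointer, which is what lets the old per-format branches collapse into the single RGBA8 copy.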
View File

@@ -45,7 +45,6 @@ protected:
   template<GPUTexture::Format display_format>
   bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip);
 
-  template<GPUTexture::Format display_format>
   bool CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip);
 
   bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit);
@@ -57,11 +56,13 @@ protected:
   void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
   void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
 
+private:
+  static constexpr GPUTexture::Format FORMAT_FOR_24BIT = GPUTexture::Format::RGBA8; // RGBA8 always supported.
+
   GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format);
 
   FixedHeapArray<u8, GPU_MAX_DISPLAY_WIDTH * GPU_MAX_DISPLAY_HEIGHT * sizeof(u32)> m_upload_buffer;
-  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565;
-  GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8;
+  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::Unknown;
   std::unique_ptr<GPUTexture> m_upload_texture;
 
   GPU_SW_Backend m_backend;

View File

@@ -3,8 +3,10 @@
 #ifdef __INTELLISENSE__
 
+#include "common/gsvector.h"
+
 #include "gpu.h"
-#include "common/gsvector.h"
 
 #include <algorithm>
 
 #define USE_VECTOR 1

View File

@@ -5,6 +5,8 @@
 #include "types.h"
 
+#include "util/gpu_texture.h"
+
 #include "common/bitfield.h"
 #include "common/bitutils.h"
 #include "common/gsvector.h"
@@ -249,6 +251,101 @@ ALWAYS_INLINE static constexpr u16 VRAMRGBA8888ToRGBA5551(u32 color)
   return Truncate16(r | (g << 5) | (b << 10) | (a << 15));
 }
 
+#ifdef CPU_ARCH_SIMD
+
+ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
+{
+  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
+}
+
+ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
+{
+  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
+
+  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
+  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
+  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
+  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
+
+  return r | g.sll32<8>() | b.sll32<16>() | a;
+}
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixels(u8*& dest, GSVector4i c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
+    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
+
+    GSVector4i::store<false>(dest, low);
+    dest += sizeof(GSVector4i);
+
+    GSVector4i::store<false>(dest, high);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
+    const GSVector4i repacked =
+      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const GSVector4i repacked = (c16 & GSVector4i::cxpr16(static_cast<s16>(0x3E0))).sll16<1>() |
+                                (c16.srl16<9>() & GSVector4i::cxpr16(0x3E)) |
+                                (c16 & GSVector4i::cxpr16(0x1F)).sll16<11>() | c16.srl16<15>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
+    const GSVector4i a = (c16 & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
+    const GSVector4i b = (c16 & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
+    const GSVector4i c = (c16.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
+    const GSVector4i d = (c16 & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
+
+    GSVector4i::store<false>(dest, (((a | b) | c) | d));
+    dest += sizeof(GSVector4i);
+  }
+}
+
+#endif
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixel(u8*& dest, u16 c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
+    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
+    dest += sizeof(c32);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) | ((c16 & 0x1F) << 11) | (c16 >> 15);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 & 0x20) << 1) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 11);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+}
+
 union GPUVertexPosition
 {
   u32 bits;
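The new scalar repack expressions are easy to sanity-check at compile time. A hedged standalone restatement (constexpr copies of the in-diff expressions, with values hand-computed from the bit layouts described above):

#include <cstdint>

constexpr uint16_t rgb5a1(uint16_t c)
{
  return static_cast<uint16_t>((c & 0x83E0) | ((c >> 10) & 0x1F) | ((c & 0x1F) << 10));
}

constexpr uint16_t a1bgr5(uint16_t c)
{
  return static_cast<uint16_t>(((c & 0x3E0) << 1) | ((c >> 9) & 0x3E) | ((c & 0x1F) << 11) | (c >> 15));
}

static_assert(rgb5a1(0xFFFF) == 0xFFFF, "opaque white is unchanged");
static_assert(a1bgr5(0xFFFF) == 0xFFFF, "every field maps onto a field");
static_assert(rgb5a1(0x801F) == 0xFC00, "red swaps into the B slot, alpha stays at bit 15");
static_assert(a1bgr5(0x801F) == 0xF801, "red lands in the top five bits, alpha in bit 0");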