GPU: Use A1BGR5 format for SW/HashCache if available

parent 8c5fadafba
commit 5725a0360b
@@ -781,6 +781,8 @@ void GPUTextureCache::SetHashCacheTextureFormat()
   // Prefer 16-bit texture formats where possible.
   if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
     s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
+  else if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::A1BGR5))
+    s_state.hash_cache_texture_format = GPUTexture::Format::A1BGR5;
   else
     s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
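
Editor's note on the format choice above: PS1 VRAM texels are 16-bit with red in the low bits (R = bits 0-4, G = 5-9, B = 10-14, mask/alpha = bit 15), so both RGB5A1 and A1BGR5 host formats are reachable with a pure 16-bit bit shuffle instead of widening every texel to RGBA8. A minimal standalone sketch of the A1BGR5 shuffle (hypothetical test program, not part of the commit):

#include <cassert>
#include <cstdint>

// VRAM 1555 (R low) -> host A1BGR5 (A = bit 0, B = bits 1-5, G = 6-10, R = 11-15).
static uint16_t ToA1BGR5(uint16_t v)
{
  return static_cast<uint16_t>(((v & 0x3E0) << 1) | ((v >> 9) & 0x3E) | ((v & 0x1F) << 11) | (v >> 15));
}

int main()
{
  assert(ToA1BGR5(0x001F) == 0xF800); // pure red lands in bits 11-15
  assert(ToA1BGR5(0x8000) == 0x0001); // mask bit lands in bit 0
  return 0;
}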
@@ -1080,70 +1082,6 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
   return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
 }
 
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
-    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
-    dest += sizeof(c32);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
-    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
-    dest += sizeof(repacked);
-  }
-}
-
-#ifdef CPU_ARCH_SIMD
-
-ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
-{
-  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
-}
-
-ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
-{
-  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
-
-  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
-  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
-  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
-  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
-
-  return r | g.sll32<8>() | b.sll32<16>() | a;
-}
-
-template<GPUTexture::Format format>
-ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
-{
-  if constexpr (format == GPUTexture::Format::RGBA8)
-  {
-    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
-    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
-
-    GSVector4i::store<false>(dest, low);
-    dest += sizeof(GSVector4i);
-
-    GSVector4i::store<false>(dest, high);
-    dest += sizeof(GSVector4i);
-  }
-  else if constexpr (format == GPUTexture::Format::RGB5A1)
-  {
-    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
-
-    const GSVector4i repacked =
-      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
-
-    GSVector4i::store<false>(dest, repacked);
-    dest += sizeof(GSVector4i);
-  }
-}
-
-#endif
-
 template<GPUTexture::Format format>
 void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
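
The VRAM5BitTo8Bit helper removed here (and re-added in gpu.h below) expands a 5-bit channel without a divide: (x * 527 + 23) >> 6 equals round(x * 255 / 31) for all 32 inputs. Spot checks:

// x = 0  -> (0     + 23) >> 6 = 0
// x = 15 -> (7905  + 23) >> 6 = 123   (255 * 15 / 31 ≈ 123.4)
// x = 31 -> (16337 + 23) >> 6 = 255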
@@ -1175,17 +1113,17 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       c16[5] = palette[(pp >> 4) & 0x0F];
       c16[6] = palette[(pp >> 8) & 0x0F];
       c16[7] = palette[pp >> 12];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 12]);
     }
 
     page += VRAM_WIDTH;
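
For reference, each halfword fetched from the page in DecodeTexture4 packs four 4-bit palette indices, lowest nibble first; a hypothetical helper (not in the tree) spelling out the order the loop above uses:

#include <array>
#include <cstdint>

static std::array<uint8_t, 4> UnpackNibbles(uint16_t pp)
{
  return {static_cast<uint8_t>(pp & 0x0F), static_cast<uint8_t>((pp >> 4) & 0x0F),
          static_cast<uint8_t>((pp >> 8) & 0x0F), static_cast<uint8_t>(pp >> 12)};
}

// UnpackNibbles(0xCBA9) yields {0x9, 0xA, 0xB, 0xC}: the leftmost texel comes from the lowest nibble.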
@@ -1206,7 +1144,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0x0F]);
       texel >>= 4;
 
       offs = (offs + 1) % 4;
@@ -1251,15 +1189,15 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       pp = *(page_ptr++);
       c16[6] = palette[pp & 0xFF];
       c16[7] = palette[(pp >> 8) & 0xFF];
-      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<true>(c16));
     }
 #endif
 
     for (; x < vram_width; x++)
     {
       const u32 pp = *(page_ptr++);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
-      WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[pp >> 8]);
     }
 
     page += VRAM_WIDTH;
@@ -1280,7 +1218,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
       if (offs == 0)
         texel = *(page_ptr++);
 
-      WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
+      ConvertVRAMPixel<format>(dest_ptr, palette[texel & 0xFF]);
       texel >>= 8;
 
       offs ^= 1;
@@ -1307,13 +1245,13 @@ void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8
 #ifdef CPU_ARCH_SIMD
   for (; x < aligned_width; x += pixels_per_vec)
   {
-    WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
+    ConvertVRAMPixels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
     page_ptr += pixels_per_vec;
   }
 #endif
 
   for (; x < width; x++)
-    WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
+    ConvertVRAMPixel<format>(dest_ptr, *(page_ptr++));
 
   page += VRAM_WIDTH;
   dest += dest_stride;
@@ -1359,6 +1297,24 @@ void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, co
       DefaultCaseIsUnreachable()
     }
   }
+  else if (dest_format == GPUTexture::Format::A1BGR5)
+  {
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::A1BGR5>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::A1BGR5>(page_ptr, width, height, dest, dest_stride);
+        break;
+
+        DefaultCaseIsUnreachable()
+    }
+  }
   else
   {
     Panic("Unsupported texture format.");
@@ -41,10 +41,8 @@ bool GPU_SW::Initialize(Error* error)
   if (!GPU::Initialize(error) || !m_backend.Initialize(g_settings.gpu_use_thread))
     return false;
 
-  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1,
-                                                         GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
-  static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
-                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGB5A1};
+  static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB5A1, GPUTexture::Format::A1BGR5,
+                                                         GPUTexture::Format::RGB565, GPUTexture::Format::RGBA8};
   for (const GPUTexture::Format format : formats_for_16bit)
   {
     if (g_gpu_device->SupportsTextureFormat(format))
@@ -53,15 +51,10 @@ bool GPU_SW::Initialize(Error* error)
       break;
     }
   }
-  for (const GPUTexture::Format format : formats_for_24bit)
-  {
-    if (g_gpu_device->SupportsTextureFormat(format))
-    {
-      m_24bit_display_format = format;
-      break;
-    }
-  }
 
+  // RGBA8 will always be supported, hence we'll find one.
+  INFO_LOG("Using {} format for 16-bit display", GPUTexture::GetFormatName(m_16bit_display_format));
+  Assert(m_16bit_display_format != GPUTexture::Format::Unknown);
   return true;
 }
 
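
Editor's note: the reordered probe list tries the two 5551 layouts first since they keep conversion a pure 16-bit shuffle, then RGB565 (still 16-bit, but the mask bit is dropped), then RGBA8 as the guaranteed fallback; since m_16bit_display_format now starts as Unknown, the Assert would only fire if a backend reported support for none of the four.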
@@ -108,129 +101,43 @@ GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format
   return m_upload_texture.get();
 }
 
-template<GPUTexture::Format out_format, typename out_type>
-static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
-
-template<GPUTexture::Format out_format, typename out_type>
-static out_type VRAM16ToOutput(u16 value);
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(u16 value)
-{
-  return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
-}
-
-template<>
-ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(u16 value)
-{
-  return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  const u32 a = ((value >> 15) != 0) ? 255 : 0;
-  return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24);
-}
-
-template<>
-ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(u16 value)
-{
-  const u32 value32 = ZeroExtend32(value);
-  const u32 r = (value32 & 31u) << 3;
-  const u32 g = ((value32 >> 5) & 31u) << 3;
-  const u32 b = ((value32 >> 10) & 31u) << 3;
-  return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB5A1, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = value & GSVector4i::cxpr16(0x3E0);
-    GSVector4i b = value.srl16<10>() & single_mask;
-    GSVector4i c = (value & single_mask).sll16<10>();
-    value = (a | b) | c;
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB5A1, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
-{
-  u32 col = 0;
-
-  const u32 aligned_width = Common::AlignDownPow2(width, 8);
-  for (; col < aligned_width; col += 8)
-  {
-    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
-    GSVector4i value = GSVector4i::load<false>(src_ptr);
-    src_ptr += 8;
-    GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
-    GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
-    GSVector4i c = (value.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
-    GSVector4i d = (value & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
-    value = (((a | b) | c) | d);
-    GSVector4i::store<false>(dst_ptr, value);
-    dst_ptr += 8;
-  }
-
-  for (; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(*(src_ptr++));
-}
-
-template<>
-ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
-{
-  for (u32 col = 0; col < width; col++)
-    *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(*(src_ptr++));
-}
-
 template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = width * sizeof(OutputPixelType);
+  u32 dst_stride = Common::AlignUpPow2(width * texture->GetPixelSize(), 4);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
   // Fast path when not wrapping around.
   if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
   {
+    [[maybe_unused]] constexpr u32 pixels_per_vec = 8;
+    [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
+
     const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x];
     const u32 src_step = VRAM_WIDTH << line_skip;
 
     for (u32 row = 0; row < height; row++)
     {
-      CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
+      const u16* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = dst_ptr;
+      u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_width; x += pixels_per_vec)
+      {
+        ConvertVRAMPixels<display_format>(dst_row_ptr, GSVector4i::load<false>(src_row_ptr));
+        src_row_ptr += pixels_per_vec;
+      }
+#endif
+
+      for (; x < width; x++)
+        ConvertVRAMPixel<display_format>(dst_row_ptr, *(src_row_ptr++));
+
       src_ptr += src_step;
       dst_ptr += dst_stride;
     }
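
Why pixels_per_vec is 8: GSVector4i is one 128-bit register, i.e. eight u16 VRAM texels per iteration. For a 368-pixel-wide display row, AlignDownPow2(368, 8) == 368, so the loop runs 46 vector iterations with no scalar tail; a 350-pixel row would run 43 vector iterations plus 6 scalar pixels.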
@@ -242,10 +149,10 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u8* dst_row_ptr = dst_ptr;
 
       for (u32 col = src_x; col < end_x; col++)
-        *(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
+        ConvertVRAMPixel<display_format>(dst_row_ptr, src_row_ptr[col % VRAM_WIDTH]);
 
       src_y += y_step;
       dst_ptr += dst_stride;
@@ -260,18 +167,13 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width,
   return true;
 }
 
-template<GPUTexture::Format display_format>
 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip)
 {
-  using OutputPixelType =
-    std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
-                       u16>;
-
-  GPUTexture* texture = GetDisplayTexture(width, height, display_format);
+  GPUTexture* texture = GetDisplayTexture(width, height, FORMAT_FOR_24BIT);
   if (!texture) [[unlikely]]
     return false;
 
-  u32 dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
+  u32 dst_stride = width * sizeof(u32);
   u8* dst_ptr = m_upload_buffer.data();
   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
 
@@ -280,8 +182,6 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     const u8* src_ptr = reinterpret_cast<const u8*>(&g_vram[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
     const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16);
     for (u32 row = 0; row < height; row++)
     {
-      if constexpr (display_format == GPUTexture::Format::RGBA8)
-      {
       const u8* src_row_ptr = src_ptr;
       u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
@@ -292,42 +192,6 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         *(dst_row_ptr++) = *(src_row_ptr++);
         *(dst_row_ptr++) = 0xFF;
       }
-      }
-      else if constexpr (display_format == GPUTexture::Format::BGRA8)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = src_row_ptr[2];
-          *(dst_row_ptr++) = src_row_ptr[1];
-          *(dst_row_ptr++) = src_row_ptr[0];
-          *(dst_row_ptr++) = 0xFF;
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB565)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
-      else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-      {
-        const u8* src_row_ptr = src_ptr;
-        u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
-        for (u32 col = 0; col < width; col++)
-        {
-          *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
-                             ((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
-          src_row_ptr += 3;
-        }
-      }
 
       src_ptr += src_stride;
       dst_ptr += dst_stride;
@@ -340,7 +204,7 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
-      OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
+      u32* dst_row_ptr = reinterpret_cast<u32*>(dst_ptr);
 
       for (u32 col = 0; col < width; col++)
       {
@@ -350,23 +214,8 @@ ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x
         const u8 shift = static_cast<u8>(col & 1u) * 8;
         const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
 
-        if constexpr (display_format == GPUTexture::Format::RGBA8)
-        {
         *(dst_row_ptr++) = rgb | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::BGRA8)
-        {
-          *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB565)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0xF800);
-        }
-        else if constexpr (display_format == GPUTexture::Format::RGB5A1)
-        {
-          *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x7C00);
-        }
       }
 
       src_y += y_step;
       dst_ptr += dst_stride;
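
A note on the byte shuffle above: 24-bit pixels are byte-packed into 16-bit VRAM words, so pixel N starts at byte offset N * 3. For even columns the R, G, B bytes sit in the low 24 bits of (s1 << 16) | s0; for odd columns they start one byte higher, hence shift = (col & 1) * 8 before the repack.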
@@ -392,6 +241,9 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
     case GPUTexture::Format::RGB5A1:
       return CopyOut15Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, width, height, line_skip);
 
+    case GPUTexture::Format::A1BGR5:
+      return CopyOut15Bit<GPUTexture::Format::A1BGR5>(src_x, src_y, width, height, line_skip);
+
     case GPUTexture::Format::RGB565:
      return CopyOut15Bit<GPUTexture::Format::RGB565>(src_x, src_y, width, height, line_skip);
 
@@ -407,23 +259,7 @@ bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u3
   }
   else
   {
-    switch (m_24bit_display_format)
-    {
-      case GPUTexture::Format::RGB5A1:
-        return CopyOut24Bit<GPUTexture::Format::RGB5A1>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGB565:
-        return CopyOut24Bit<GPUTexture::Format::RGB565>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::RGBA8:
-        return CopyOut24Bit<GPUTexture::Format::RGBA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      case GPUTexture::Format::BGRA8:
-        return CopyOut24Bit<GPUTexture::Format::BGRA8>(src_x, src_y, skip_x, width, height, line_skip);
-
-      default:
-        UnreachableCode();
-    }
+    return CopyOut24Bit(src_x, src_y, skip_x, width, height, line_skip);
   }
 }
 
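
Editor's note: CopyOut24Bit is no longer templated on the display format (see the header change below), and 24-bit output always goes through FORMAT_FOR_24BIT (RGBA8, which every backend supports), so the dispatch switch collapses to a single call.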
@@ -45,7 +45,6 @@ protected:
   template<GPUTexture::Format display_format>
   bool CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip);
 
-  template<GPUTexture::Format display_format>
   bool CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip);
 
   bool CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit);
@@ -57,11 +56,13 @@ protected:
   void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
   void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const;
 
 private:
+  static constexpr GPUTexture::Format FORMAT_FOR_24BIT = GPUTexture::Format::RGBA8; // RGBA8 always supported.
+
   GPUTexture* GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format);
 
   FixedHeapArray<u8, GPU_MAX_DISPLAY_WIDTH * GPU_MAX_DISPLAY_HEIGHT * sizeof(u32)> m_upload_buffer;
-  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::RGB565;
-  GPUTexture::Format m_24bit_display_format = GPUTexture::Format::RGBA8;
+  GPUTexture::Format m_16bit_display_format = GPUTexture::Format::Unknown;
   std::unique_ptr<GPUTexture> m_upload_texture;
 
   GPU_SW_Backend m_backend;
@@ -3,8 +3,10 @@
 
 #ifdef __INTELLISENSE__
 
-#include "common/gsvector.h"
+#include "gpu.h"
+
+#include "common/gsvector.h"
 
 #include <algorithm>
 
 #define USE_VECTOR 1
@@ -5,6 +5,8 @@
 
 #include "types.h"
 
+#include "util/gpu_texture.h"
+
 #include "common/bitfield.h"
 #include "common/bitutils.h"
 #include "common/gsvector.h"
@@ -249,6 +251,101 @@ ALWAYS_INLINE static constexpr u16 VRAMRGBA8888ToRGBA5551(u32 color)
   return Truncate16(r | (g << 5) | (b << 10) | (a << 15));
 }
 
+#ifdef CPU_ARCH_SIMD
+
+ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
+{
+  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
+}
+
+ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
+{
+  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
+
+  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
+  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
+  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
+  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
+
+  return r | g.sll32<8>() | b.sll32<16>() | a;
+}
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixels(u8*& dest, GSVector4i c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
+    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
+
+    GSVector4i::store<false>(dest, low);
+    dest += sizeof(GSVector4i);
+
+    GSVector4i::store<false>(dest, high);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
+
+    const GSVector4i repacked =
+      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const GSVector4i repacked = (c16 & GSVector4i::cxpr16(static_cast<s16>(0x3E0))).sll16<1>() |
+                                (c16.srl16<9>() & GSVector4i::cxpr16(0x3E)) |
+                                (c16 & GSVector4i::cxpr16(0x1F)).sll16<11>() | c16.srl16<15>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
+    const GSVector4i a = (c16 & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
+    const GSVector4i b = (c16 & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
+    const GSVector4i c = (c16.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
+    const GSVector4i d = (c16 & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
+    GSVector4i::store<false>(dest, (((a | b) | c) | d));
+    dest += sizeof(GSVector4i);
+  }
+}
+
+#endif
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void ConvertVRAMPixel(u8*& dest, u16 c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
+    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
+    dest += sizeof(c32);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::A1BGR5)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) | ((c16 & 0x1F) << 11) | (c16 >> 15);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB565)
+  {
+    const u16 repacked = ((c16 & 0x3E0) << 1) | ((c16 & 0x20) << 1) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 11);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+}
+
 union GPUVertexPosition
 {
   u32 bits;
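
A minimal standalone check (assumed test harness, not part of the commit) that the scalar A1BGR5 repack added above is a bijection on u16: every source bit maps to a distinct destination bit, so no two VRAM texels collide after conversion.

#include <cstdint>
#include <cstdio>

int main()
{
  static bool seen[65536] = {};
  for (uint32_t v = 0; v <= 0xFFFF; v++)
  {
    const uint16_t c16 = static_cast<uint16_t>(v);
    const uint16_t out = static_cast<uint16_t>(((c16 & 0x3E0) << 1) | ((c16 >> 9) & 0x3E) |
                                               ((c16 & 0x1F) << 11) | (c16 >> 15));
    if (seen[out])
    {
      std::printf("collision at %04X\n", v);
      return 1;
    }
    seen[out] = true;
  }
  std::printf("A1BGR5 repack is a bijection on u16\n");
  return 0;
}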