GPU/HW: Vectorize texture conversion routines

~100% speed up for RGBA8, ~50% for RGB5A1.
This commit is contained in:
Stenzek 2024-11-30 14:01:58 +10:00
parent fa4dc381ed
commit c6e2235ee2
No known key found for this signature in database
1 changed files with 227 additions and 52 deletions

View File

@ -242,6 +242,7 @@ static bool ShouldTrackVRAMWrites();
static bool IsDumpingVRAMWriteTextures();
static void UpdateVRAMTrackingState();
static void SetHashCacheTextureFormat();
static bool CompilePipelines();
static void DestroyPipelines();
@ -280,11 +281,14 @@ static void RemoveVRAMWrite(VRAMWrite* entry);
static void DumpTexturesFromVRAMWrite(VRAMWrite* entry);
static void DumpTextureFromPage(const Source* src);
static void DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u32* dest, u32 dest_stride,
u32 width, u32 height);
static void DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u32* dest, u32 dest_stride);
static void DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u32* dest, u32 dest_stride);
static void DecodeTexture16(const u16* page, u32 width, u32 height, u32* dest, u32 dest_stride);
static void DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u8* dest, u32 dest_stride,
u32 width, u32 height, GPUTexture::Format dest_format);
template<GPUTexture::Format dest_format>
static void DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest, u32 dest_stride);
template<GPUTexture::Format dest_format>
static void DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u8* dest, u32 dest_stride);
template<GPUTexture::Format dest_format>
static void DecodeTexture16(const u16* page, u32 width, u32 height, u8* dest, u32 dest_stride);
static void DecodeTexture(u8 page, GPUTexturePaletteReg palette, GPUTextureMode mode, GPUTexture* texture);
static std::optional<TextureReplacementType> GetTextureReplacementTypeFromFileTitle(const std::string_view file_title);
@ -512,6 +516,7 @@ struct GPUTextureCacheState
VRAMWrite* last_vram_write = nullptr;
bool track_vram_writes = false;
GPUTexture::Format hash_cache_texture_format = GPUTexture::Format::Unknown;
HashCache hash_cache;
/// List of candidates for purging when the hash cache gets too large.
@ -568,6 +573,7 @@ bool GPUTextureCache::IsDumpingVRAMWriteTextures()
bool GPUTextureCache::Initialize()
{
SetHashCacheTextureFormat();
LoadLocalConfiguration(false, false);
UpdateVRAMTrackingState();
if (!CompilePipelines())
@ -770,6 +776,21 @@ void GPUTextureCache::Shutdown()
s_state.game_id = {};
}
void GPUTextureCache::SetHashCacheTextureFormat()
{
#if 0
// Prefer 16-bit texture formats where possible.
if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
else
s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
INFO_LOG("Using {} format for hash cache entries.", GPUTexture::GetFormatName(s_state.hash_cache_texture_format));
#else
s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
#endif
}
bool GPUTextureCache::CompilePipelines()
{
if (!g_settings.texture_replacements.enable_texture_replacements)
@ -1063,29 +1084,116 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
}
// TODO: Vectorize these with gather.
void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u32* dest,
template<GPUTexture::Format format>
ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
{
if constexpr (format == GPUTexture::Format::RGBA8)
{
const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
dest += sizeof(c32);
}
else if constexpr (format == GPUTexture::Format::RGB5A1)
{
const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
dest += sizeof(repacked);
}
}
#ifdef CPU_ARCH_SIMD
ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
{
return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
}
ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
{
static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
return r | g.sll32<8>() | b.sll32<16>() | b.sll32<24>() | a;
}
template<GPUTexture::Format format>
ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
{
if constexpr (format == GPUTexture::Format::RGBA8)
{
const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
GSVector4i::store<false>(dest, low);
dest += sizeof(GSVector4i);
GSVector4i::store<false>(dest, high);
dest += sizeof(GSVector4i);
}
else if constexpr (format == GPUTexture::Format::RGB5A1)
{
static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
const GSVector4i repacked =
(c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
GSVector4i::store<false>(dest, repacked);
dest += sizeof(GSVector4i);
}
}
#endif
template<GPUTexture::Format format>
void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
u32 dest_stride)
{
if ((width % 4u) == 0)
{
const u32 vram_width = width / 4;
[[maybe_unused]] constexpr u32 vram_pixels_per_vec = 2;
[[maybe_unused]] const u32 aligned_vram_width = Common::AlignDownPow2(vram_width, vram_pixels_per_vec);
for (u32 y = 0; y < height; y++)
{
const u16* page_ptr = page;
u32* dest_ptr = dest;
u8* dest_ptr = dest;
u32 x = 0;
for (u32 x = 0; x < vram_width; x++)
#ifdef CPU_ARCH_SIMD
for (; x < aligned_vram_width; x += vram_pixels_per_vec)
{
// No variable shift without AVX, kinda pointless to vectorize the extract...
alignas(VECTOR_ALIGNMENT) u16 c16[vram_pixels_per_vec * 4];
u32 pp = *(page_ptr++);
c16[0] = palette[pp & 0x0F];
c16[1] = palette[(pp >> 4) & 0x0F];
c16[2] = palette[(pp >> 8) & 0x0F];
c16[3] = palette[pp >> 12];
pp = *(page_ptr++);
c16[4] = palette[pp & 0x0F];
c16[5] = palette[(pp >> 4) & 0x0F];
c16[6] = palette[(pp >> 8) & 0x0F];
c16[7] = palette[pp >> 12];
WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
}
#endif
for (; x < vram_width; x++)
{
const u32 pp = *(page_ptr++);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp & 0x0F]);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[(pp >> 4) & 0x0F]);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[(pp >> 8) & 0x0F]);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp >> 12]);
WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
}
page += VRAM_WIDTH;
dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
dest += dest_stride;
}
}
else
@ -1093,7 +1201,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
for (u32 y = 0; y < height; y++)
{
const u16* page_ptr = page;
u32* dest_ptr = dest;
u8* dest_ptr = dest;
u32 offs = 0;
u16 texel = 0;
@ -1102,37 +1210,64 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
if (offs == 0)
texel = *(page_ptr++);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[texel & 0x0F]);
WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
texel >>= 4;
offs = (offs + 1) % 4;
}
page += VRAM_WIDTH;
dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
dest += dest_stride;
}
}
}
void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u32* dest,
template<GPUTexture::Format format>
void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
u32 dest_stride)
{
if ((width % 2u) == 0)
{
const u32 vram_width = width / 2;
[[maybe_unused]] constexpr u32 vram_pixels_per_vec = 4;
[[maybe_unused]] const u32 aligned_vram_width = Common::AlignDownPow2(vram_width, vram_pixels_per_vec);
for (u32 y = 0; y < height; y++)
{
const u16* page_ptr = page;
u32* dest_ptr = dest;
u8* dest_ptr = dest;
u32 x = 0;
for (u32 x = 0; x < vram_width; x++)
#ifdef CPU_ARCH_SIMD
for (; x < aligned_vram_width; x += vram_pixels_per_vec)
{
// No variable shift without AVX, kinda pointless to vectorize the extract...
alignas(VECTOR_ALIGNMENT) u16 c16[vram_pixels_per_vec * 2];
u32 pp = *(page_ptr++);
c16[0] = palette[pp & 0xFF];
c16[1] = palette[(pp >> 8) & 0xFF];
pp = *(page_ptr++);
c16[2] = palette[pp & 0xFF];
c16[3] = palette[(pp >> 8) & 0xFF];
pp = *(page_ptr++);
c16[4] = palette[pp & 0xFF];
c16[5] = palette[(pp >> 8) & 0xFF];
pp = *(page_ptr++);
c16[6] = palette[pp & 0xFF];
c16[7] = palette[(pp >> 8) & 0xFF];
WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
}
#endif
for (; x < vram_width; x++)
{
const u32 pp = *(page_ptr++);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp & 0xFF]);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp >> 8]);
WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
}
page += VRAM_WIDTH;
dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
dest += dest_stride;
}
}
else
@ -1140,7 +1275,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
for (u32 y = 0; y < height; y++)
{
const u16* page_ptr = page;
u32* dest_ptr = dest;
u8* dest_ptr = dest;
u32 offs = 0;
u16 texel = 0;
@ -1149,70 +1284,110 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
if (offs == 0)
texel = *(page_ptr++);
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[texel & 0xFF]);
WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
texel >>= 8;
offs ^= 1;
}
page += VRAM_WIDTH;
dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
dest += dest_stride;
}
}
}
void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u32* dest, u32 dest_stride)
template<GPUTexture::Format format>
void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8* dest, u32 dest_stride)
{
[[maybe_unused]] constexpr u32 pixels_per_vec = 8;
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
for (u32 y = 0; y < height; y++)
{
const u16* page_ptr = page;
u32* dest_ptr = dest;
u8* dest_ptr = dest;
u32 x = 0;
for (u32 x = 0; x < width; x++)
*(dest_ptr++) = VRAMRGBA5551ToRGBA8888(*(page_ptr++));
#ifdef CPU_ARCH_SIMD
for (; x < aligned_width; x += pixels_per_vec)
{
WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
page_ptr += pixels_per_vec;
}
#endif
for (; x < width; x++)
WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
page += VRAM_WIDTH;
dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
dest += dest_stride;
}
}
void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u32* dest,
u32 dest_stride, u32 width, u32 height)
void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u8* dest,
u32 dest_stride, u32 width, u32 height, GPUTexture::Format dest_format)
{
switch (mode)
if (dest_format == GPUTexture::Format::RGBA8)
{
case GPUTextureMode::Palette4Bit:
DecodeTexture4(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Palette8Bit:
DecodeTexture8(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Direct16Bit:
case GPUTextureMode::Reserved_Direct16Bit:
DecodeTexture16(page_ptr, width, height, dest, dest_stride);
break;
switch (mode)
{
case GPUTextureMode::Palette4Bit:
DecodeTexture4<GPUTexture::Format::RGBA8>(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Palette8Bit:
DecodeTexture8<GPUTexture::Format::RGBA8>(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Direct16Bit:
case GPUTextureMode::Reserved_Direct16Bit:
DecodeTexture16<GPUTexture::Format::RGBA8>(page_ptr, width, height, dest, dest_stride);
break;
DefaultCaseIsUnreachable()
DefaultCaseIsUnreachable()
}
}
else if (dest_format == GPUTexture::Format::RGB5A1)
{
switch (mode)
{
case GPUTextureMode::Palette4Bit:
DecodeTexture4<GPUTexture::Format::RGB5A1>(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Palette8Bit:
DecodeTexture8<GPUTexture::Format::RGB5A1>(page_ptr, palette, width, height, dest, dest_stride);
break;
case GPUTextureMode::Direct16Bit:
case GPUTextureMode::Reserved_Direct16Bit:
DecodeTexture16<GPUTexture::Format::RGB5A1>(page_ptr, width, height, dest, dest_stride);
break;
DefaultCaseIsUnreachable()
}
}
else
{
Panic("Unsupported texture format.");
}
}
void GPUTextureCache::DecodeTexture(u8 page, GPUTexturePaletteReg palette, GPUTextureMode mode, GPUTexture* texture)
{
alignas(16) static u32 s_temp_buffer[TEXTURE_PAGE_WIDTH * TEXTURE_PAGE_HEIGHT];
alignas(16) static u8 s_temp_buffer[TEXTURE_PAGE_WIDTH * TEXTURE_PAGE_HEIGHT * sizeof(u32)];
u32* tex_map;
const u32 ps = texture->GetPixelSize();
u8* tex_map;
u32 tex_stride;
const bool mapped =
texture->Map(reinterpret_cast<void**>(&tex_map), &tex_stride, 0, 0, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT);
if (!mapped)
{
tex_map = s_temp_buffer;
tex_stride = sizeof(u32) * TEXTURE_PAGE_WIDTH;
tex_stride = Common::AlignUpPow2(ps * TEXTURE_PAGE_WIDTH, 4);
}
const u16* page_ptr = VRAMPagePointer(page);
const u16* palette_ptr = TextureModeHasPalette(mode) ? VRAMPalettePointer(palette) : nullptr;
DecodeTexture(mode, page_ptr, palette_ptr, tex_map, tex_stride, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT);
DecodeTexture(mode, page_ptr, palette_ptr, tex_map, tex_stride, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT,
texture->GetFormat());
if (mapped)
texture->Unmap();
@ -2064,7 +2239,7 @@ GPUTextureCache::HashCacheEntry* GPUTextureCache::LookupHashCache(SourceKey key,
entry.sources = {};
entry.texture =
g_gpu_device->FetchTexture(TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT, 1, 1, 1, GPUTexture::Type::Texture,
GPUTexture::Format::RGBA8, GPUTexture::Flags::None);
s_state.hash_cache_texture_format, GPUTexture::Flags::None);
if (!entry.texture)
{
ERROR_LOG("Failed to create texture.");
@ -2616,8 +2791,8 @@ void GPUTextureCache::DumpTexture(TextureReplacementType type, u32 offset_x, u32
DEV_LOG("Dumping VRAM write {:016X} [{}x{}] at {}", src_hash, width, height, rect);
Image image(width, height, ImageFormat::RGBA8);
GPUTextureCache::DecodeTexture(mode, &g_vram[rect.top * VRAM_WIDTH + rect.left], palette_data,
reinterpret_cast<u32*>(image.GetPixels()), image.GetPitch(), width, height);
GPUTextureCache::DecodeTexture(mode, &g_vram[rect.top * VRAM_WIDTH + rect.left], palette_data, image.GetPixels(),
image.GetPitch(), width, height, GPUTexture::Format::RGBA8);
// TODO: Vectorize this.
u32* image_pixels = reinterpret_cast<u32*>(image.GetPixels());