diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp
index bf0bca50d..389e3c443 100644
--- a/src/core/gpu_hw_texture_cache.cpp
+++ b/src/core/gpu_hw_texture_cache.cpp
@@ -242,6 +242,7 @@ static bool ShouldTrackVRAMWrites();
 static bool IsDumpingVRAMWriteTextures();
 static void UpdateVRAMTrackingState();
 
+static void SetHashCacheTextureFormat();
 static bool CompilePipelines();
 static void DestroyPipelines();
 
@@ -280,11 +281,14 @@ static void RemoveVRAMWrite(VRAMWrite* entry);
 static void DumpTexturesFromVRAMWrite(VRAMWrite* entry);
 static void DumpTextureFromPage(const Source* src);
 
-static void DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u32* dest, u32 dest_stride,
-                          u32 width, u32 height);
-static void DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u32* dest, u32 dest_stride);
-static void DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u32* dest, u32 dest_stride);
-static void DecodeTexture16(const u16* page, u32 width, u32 height, u32* dest, u32 dest_stride);
+static void DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u8* dest, u32 dest_stride,
+                          u32 width, u32 height, GPUTexture::Format dest_format);
+template<GPUTexture::Format format>
+static void DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest, u32 dest_stride);
+template<GPUTexture::Format format>
+static void DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u8* dest, u32 dest_stride);
+template<GPUTexture::Format format>
+static void DecodeTexture16(const u16* page, u32 width, u32 height, u8* dest, u32 dest_stride);
 static void DecodeTexture(u8 page, GPUTexturePaletteReg palette, GPUTextureMode mode, GPUTexture* texture);
 
 static std::optional<TextureReplacementType> GetTextureReplacementTypeFromFileTitle(const std::string_view file_title);
@@ -512,6 +516,7 @@ struct GPUTextureCacheState
   VRAMWrite* last_vram_write = nullptr;
   bool track_vram_writes = false;
 
+  GPUTexture::Format hash_cache_texture_format = GPUTexture::Format::Unknown;
   HashCache hash_cache;
 
   /// List of candidates for purging when the hash cache gets too large.
@@ -568,6 +573,7 @@ bool GPUTextureCache::IsDumpingVRAMWriteTextures()
 
 bool GPUTextureCache::Initialize()
 {
+  SetHashCacheTextureFormat();
   LoadLocalConfiguration(false, false);
   UpdateVRAMTrackingState();
   if (!CompilePipelines())
@@ -770,6 +776,21 @@ void GPUTextureCache::Shutdown()
   s_state.game_id = {};
 }
 
+void GPUTextureCache::SetHashCacheTextureFormat()
+{
+#if 0
+  // Prefer 16-bit texture formats where possible.
+  if (g_gpu_device->SupportsTextureFormat(GPUTexture::Format::RGB5A1))
+    s_state.hash_cache_texture_format = GPUTexture::Format::RGB5A1;
+  else
+    s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
+
+  INFO_LOG("Using {} format for hash cache entries.", GPUTexture::GetFormatName(s_state.hash_cache_texture_format));
+#else
+  s_state.hash_cache_texture_format = GPUTexture::Format::RGBA8;
+#endif
+}
+
 bool GPUTextureCache::CompilePipelines()
 {
   if (!g_settings.texture_replacements.enable_texture_replacements)
@@ -1063,29 +1084,116 @@ ALWAYS_INLINE_RELEASE static const u16* VRAMPalettePointer(GPUTexturePaletteReg
   return &g_vram[VRAM_WIDTH * palette.GetYBase() + palette.GetXBase()];
 }
 
-// TODO: Vectorize these with gather.
-void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u32* dest,
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void WriteDecodedTexel(u8*& dest, u16 c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const u32 c32 = VRAMRGBA5551ToRGBA8888(c16);
+    std::memcpy(std::assume_aligned<sizeof(c32)>(dest), &c32, sizeof(c32));
+    dest += sizeof(c32);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    const u16 repacked = (c16 & 0x83E0) | ((c16 >> 10) & 0x1F) | ((c16 & 0x1F) << 10);
+    std::memcpy(std::assume_aligned<sizeof(repacked)>(dest), &repacked, sizeof(repacked));
+    dest += sizeof(repacked);
+  }
+}
+
+#ifdef CPU_ARCH_SIMD
+
+ALWAYS_INLINE static GSVector4i VRAM5BitTo8Bit(GSVector4i val)
+{
+  return val.mul32l(GSVector4i::cxpr(527)).add32(GSVector4i::cxpr(23)).srl32<6>();
+}
+
+ALWAYS_INLINE static GSVector4i VRAMRGB5A1ToRGBA8888(GSVector4i val)
+{
+  static constexpr GSVector4i cmask = GSVector4i::cxpr(0x1F);
+
+  const GSVector4i r = VRAM5BitTo8Bit(val & cmask);
+  const GSVector4i g = VRAM5BitTo8Bit((val.srl32<5>() & cmask));
+  const GSVector4i b = VRAM5BitTo8Bit((val.srl32<10>() & cmask));
+  const GSVector4i a = val.srl32<15>().sll32<31>().sra32<7>();
+
+  return r | g.sll32<8>() | b.sll32<16>() | a;
+}
+
+template<GPUTexture::Format format>
+ALWAYS_INLINE static void WriteDecodedTexels(u8*& dest, GSVector4i c16)
+{
+  if constexpr (format == GPUTexture::Format::RGBA8)
+  {
+    const GSVector4i low = VRAMRGB5A1ToRGBA8888(c16.upl16());
+    const GSVector4i high = VRAMRGB5A1ToRGBA8888(c16.uph16());
+
+    GSVector4i::store<false>(dest, low);
+    dest += sizeof(GSVector4i);
+
+    GSVector4i::store<false>(dest, high);
+    dest += sizeof(GSVector4i);
+  }
+  else if constexpr (format == GPUTexture::Format::RGB5A1)
+  {
+    static constexpr GSVector4i cmask = GSVector4i::cxpr16(0x1F);
+
+    const GSVector4i repacked =
+      (c16 & GSVector4i::cxpr16(static_cast<s16>(0x83E0))) | (c16.srl16<10>() & cmask) | (c16 & cmask).sll16<10>();
+
+    GSVector4i::store<false>(dest, repacked);
+    dest += sizeof(GSVector4i);
+  }
+}
+
+#endif
+
+template<GPUTexture::Format format>
+void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
 {
   if ((width % 4u) == 0)
   {
     const u32 vram_width = width / 4;
+    [[maybe_unused]] constexpr u32 vram_pixels_per_vec = 2;
+    [[maybe_unused]] const u32 aligned_vram_width = Common::AlignDownPow2(vram_width, vram_pixels_per_vec);
+
     for (u32 y = 0; y < height; y++)
     {
       const u16* page_ptr = page;
-      u32* dest_ptr = dest;
+      u8* dest_ptr = dest;
+      u32 x = 0;
 
-      for (u32 x = 0; x < vram_width; x++)
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_vram_width; x += vram_pixels_per_vec)
+      {
+        // No variable shift without AVX, kinda pointless to vectorize the extract...
+        alignas(VECTOR_ALIGNMENT) u16 c16[vram_pixels_per_vec * 4];
+        u32 pp = *(page_ptr++);
+        c16[0] = palette[pp & 0x0F];
+        c16[1] = palette[(pp >> 4) & 0x0F];
+        c16[2] = palette[(pp >> 8) & 0x0F];
+        c16[3] = palette[pp >> 12];
+        pp = *(page_ptr++);
+        c16[4] = palette[pp & 0x0F];
+        c16[5] = palette[(pp >> 4) & 0x0F];
+        c16[6] = palette[(pp >> 8) & 0x0F];
+        c16[7] = palette[pp >> 12];
+        WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      }
+#endif
+
+      for (; x < vram_width; x++)
       {
         const u32 pp = *(page_ptr++);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp & 0x0F]);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[(pp >> 4) & 0x0F]);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[(pp >> 8) & 0x0F]);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp >> 12]);
+        WriteDecodedTexel<format>(dest_ptr, palette[pp & 0x0F]);
+        WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 4) & 0x0F]);
+        WriteDecodedTexel<format>(dest_ptr, palette[(pp >> 8) & 0x0F]);
+        WriteDecodedTexel<format>(dest_ptr, palette[pp >> 12]);
       }
 
       page += VRAM_WIDTH;
-      dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
+      dest += dest_stride;
     }
   }
   else
@@ -1093,7 +1201,7 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
     for (u32 y = 0; y < height; y++)
     {
       const u16* page_ptr = page;
-      u32* dest_ptr = dest;
+      u8* dest_ptr = dest;
       u32 offs = 0;
       u16 texel = 0;
 
@@ -1102,37 +1210,64 @@ void GPUTextureCache::DecodeTexture4(const u16* page, const u16* palette, u32 wi
         if (offs == 0)
          texel = *(page_ptr++);
 
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[texel & 0x0F]);
+        WriteDecodedTexel<format>(dest_ptr, palette[texel & 0x0F]);
         texel >>= 4;
         offs = (offs + 1) % 4;
       }
 
       page += VRAM_WIDTH;
-      dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
+      dest += dest_stride;
     }
   }
 }
 
-void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u32* dest,
+
+template<GPUTexture::Format format>
+void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 width, u32 height, u8* dest,
                                      u32 dest_stride)
 {
   if ((width % 2u) == 0)
   {
     const u32 vram_width = width / 2;
+    [[maybe_unused]] constexpr u32 vram_pixels_per_vec = 4;
+    [[maybe_unused]] const u32 aligned_vram_width = Common::AlignDownPow2(vram_width, vram_pixels_per_vec);
+
    for (u32 y = 0; y < height; y++)
    {
      const u16* page_ptr = page;
-      u32* dest_ptr = dest;
+      u8* dest_ptr = dest;
+      u32 x = 0;
 
-      for (u32 x = 0; x < vram_width; x++)
+#ifdef CPU_ARCH_SIMD
+      for (; x < aligned_vram_width; x += vram_pixels_per_vec)
+      {
+        // No variable shift without AVX, kinda pointless to vectorize the extract...
+        alignas(VECTOR_ALIGNMENT) u16 c16[vram_pixels_per_vec * 2];
+        u32 pp = *(page_ptr++);
+        c16[0] = palette[pp & 0xFF];
+        c16[1] = palette[(pp >> 8) & 0xFF];
+        pp = *(page_ptr++);
+        c16[2] = palette[pp & 0xFF];
+        c16[3] = palette[(pp >> 8) & 0xFF];
+        pp = *(page_ptr++);
+        c16[4] = palette[pp & 0xFF];
+        c16[5] = palette[(pp >> 8) & 0xFF];
+        pp = *(page_ptr++);
+        c16[6] = palette[pp & 0xFF];
+        c16[7] = palette[(pp >> 8) & 0xFF];
+        WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<true>(c16));
+      }
+#endif
+
+      for (; x < vram_width; x++)
       {
         const u32 pp = *(page_ptr++);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp & 0xFF]);
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[pp >> 8]);
+        WriteDecodedTexel<format>(dest_ptr, palette[pp & 0xFF]);
+        WriteDecodedTexel<format>(dest_ptr, palette[pp >> 8]);
       }
 
       page += VRAM_WIDTH;
-      dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
+      dest += dest_stride;
    }
  }
  else
@@ -1140,7 +1275,7 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
    for (u32 y = 0; y < height; y++)
    {
      const u16* page_ptr = page;
-      u32* dest_ptr = dest;
+      u8* dest_ptr = dest;
      u32 offs = 0;
      u16 texel = 0;
 
@@ -1149,70 +1284,110 @@ void GPUTextureCache::DecodeTexture8(const u16* page, const u16* palette, u32 wi
        if (offs == 0)
          texel = *(page_ptr++);
 
-        *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(palette[texel & 0xFF]);
+        WriteDecodedTexel<format>(dest_ptr, palette[texel & 0xFF]);
        texel >>= 8;
 
        offs ^= 1;
      }
 
      page += VRAM_WIDTH;
-      dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
+      dest += dest_stride;
    }
  }
 }
 
-void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u32* dest, u32 dest_stride)
+template<GPUTexture::Format format>
+void GPUTextureCache::DecodeTexture16(const u16* page, u32 width, u32 height, u8* dest, u32 dest_stride)
 {
+  [[maybe_unused]] constexpr u32 pixels_per_vec = 8;
+  [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(width, pixels_per_vec);
+
  for (u32 y = 0; y < height; y++)
  {
    const u16* page_ptr = page;
-    u32* dest_ptr = dest;
+    u8* dest_ptr = dest;
+    u32 x = 0;
 
-    for (u32 x = 0; x < width; x++)
-      *(dest_ptr++) = VRAMRGBA5551ToRGBA8888(*(page_ptr++));
+#ifdef CPU_ARCH_SIMD
+    for (; x < aligned_width; x += pixels_per_vec)
+    {
+      WriteDecodedTexels<format>(dest_ptr, GSVector4i::load<false>(page_ptr));
+      page_ptr += pixels_per_vec;
+    }
+#endif
+
+    for (; x < width; x++)
+      WriteDecodedTexel<format>(dest_ptr, *(page_ptr++));
 
    page += VRAM_WIDTH;
-    dest = reinterpret_cast<u32*>(reinterpret_cast<u8*>(dest) + dest_stride);
+    dest += dest_stride;
  }
 }
 
-void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u32* dest,
-                                    u32 dest_stride, u32 width, u32 height)
+void GPUTextureCache::DecodeTexture(GPUTextureMode mode, const u16* page_ptr, const u16* palette, u8* dest,
+                                    u32 dest_stride, u32 width, u32 height, GPUTexture::Format dest_format)
 {
-  switch (mode)
+  if (dest_format == GPUTexture::Format::RGBA8)
  {
-    case GPUTextureMode::Palette4Bit:
-      DecodeTexture4(page_ptr, palette, width, height, dest, dest_stride);
-      break;
-    case GPUTextureMode::Palette8Bit:
-      DecodeTexture8(page_ptr, palette, width, height, dest, dest_stride);
-      break;
-    case GPUTextureMode::Direct16Bit:
-    case GPUTextureMode::Reserved_Direct16Bit:
-      DecodeTexture16(page_ptr, width, height, dest, dest_stride);
-      break;
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::RGBA8>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::RGBA8>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::RGBA8>(page_ptr, width, height, dest, dest_stride);
+        break;
 
-      DefaultCaseIsUnreachable()
+        DefaultCaseIsUnreachable()
+    }
+  }
+  else if (dest_format == GPUTexture::Format::RGB5A1)
+  {
+    switch (mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+        DecodeTexture4<GPUTexture::Format::RGB5A1>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Palette8Bit:
+        DecodeTexture8<GPUTexture::Format::RGB5A1>(page_ptr, palette, width, height, dest, dest_stride);
+        break;
+      case GPUTextureMode::Direct16Bit:
+      case GPUTextureMode::Reserved_Direct16Bit:
+        DecodeTexture16<GPUTexture::Format::RGB5A1>(page_ptr, width, height, dest, dest_stride);
+        break;
+
+        DefaultCaseIsUnreachable()
+    }
+  }
+  else
+  {
+    Panic("Unsupported texture format.");
  }
 }
 
 void GPUTextureCache::DecodeTexture(u8 page, GPUTexturePaletteReg palette, GPUTextureMode mode, GPUTexture* texture)
 {
-  alignas(16) static u32 s_temp_buffer[TEXTURE_PAGE_WIDTH * TEXTURE_PAGE_HEIGHT];
+  alignas(16) static u8 s_temp_buffer[TEXTURE_PAGE_WIDTH * TEXTURE_PAGE_HEIGHT * sizeof(u32)];
 
-  u32* tex_map;
+  const u32 ps = texture->GetPixelSize();
+  u8* tex_map;
  u32 tex_stride;
  const bool mapped =
    texture->Map(reinterpret_cast<void**>(&tex_map), &tex_stride, 0, 0, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT);
  if (!mapped)
  {
    tex_map = s_temp_buffer;
-    tex_stride = sizeof(u32) * TEXTURE_PAGE_WIDTH;
+    tex_stride = Common::AlignUpPow2(ps * TEXTURE_PAGE_WIDTH, 4);
  }
 
  const u16* page_ptr = VRAMPagePointer(page);
  const u16* palette_ptr = TextureModeHasPalette(mode) ? VRAMPalettePointer(palette) : nullptr;
-  DecodeTexture(mode, page_ptr, palette_ptr, tex_map, tex_stride, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT);
+  DecodeTexture(mode, page_ptr, palette_ptr, tex_map, tex_stride, TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT,
+                texture->GetFormat());
 
  if (mapped)
    texture->Unmap();
@@ -2064,7 +2239,7 @@ GPUTextureCache::HashCacheEntry* GPUTextureCache::LookupHashCache(SourceKey key,
   entry.sources = {};
   entry.texture =
     g_gpu_device->FetchTexture(TEXTURE_PAGE_WIDTH, TEXTURE_PAGE_HEIGHT, 1, 1, 1, GPUTexture::Type::Texture,
-                               GPUTexture::Format::RGBA8, GPUTexture::Flags::None);
+                               s_state.hash_cache_texture_format, GPUTexture::Flags::None);
   if (!entry.texture)
   {
     ERROR_LOG("Failed to create texture.");
@@ -2616,8 +2791,8 @@ void GPUTextureCache::DumpTexture(TextureReplacementType type, u32 offset_x, u32
   DEV_LOG("Dumping VRAM write {:016X} [{}x{}] at {}", src_hash, width, height, rect);
 
   Image image(width, height, ImageFormat::RGBA8);
-  GPUTextureCache::DecodeTexture(mode, &g_vram[rect.top * VRAM_WIDTH + rect.left], palette_data,
-                                 reinterpret_cast<u32*>(image.GetPixels()), image.GetPitch(), width, height);
+  GPUTextureCache::DecodeTexture(mode, &g_vram[rect.top * VRAM_WIDTH + rect.left], palette_data, image.GetPixels(),
+                                 image.GetPitch(), width, height, GPUTexture::Format::RGBA8);
 
   // TODO: Vectorize this.
   u32* image_pixels = reinterpret_cast<u32*>(image.GetPixels());
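Note: for reference, a scalar sketch of what the two WriteDecodedTexel<> paths store per 16-bit VRAM texel (bits 0-4 = R, 5-9 = G, 10-14 = B, 15 = mask). The helper names below are illustrative only and are not part of the patch; the 5-to-8-bit expansion follows the (x * 527 + 23) >> 6 used by the vectorized VRAM5BitTo8Bit, and the 16-bit repack is the same R/B field swap as the RGB5A1 branch.

  // Hypothetical standalone helpers, for illustration only (not in the patch).
  static u32 DecodeTexelToRGBA8(u16 c16)
  {
    const u32 r = ((c16 & 0x1Fu) * 527u + 23u) >> 6;          // 5-bit red   -> 8-bit
    const u32 g = (((c16 >> 5) & 0x1Fu) * 527u + 23u) >> 6;   // 5-bit green -> 8-bit
    const u32 b = (((c16 >> 10) & 0x1Fu) * 527u + 23u) >> 6;  // 5-bit blue  -> 8-bit
    const u32 a = (c16 & 0x8000u) ? 0xFFu : 0x00u;            // mask bit    -> alpha
    return r | (g << 8) | (b << 16) | (a << 24);
  }

  static u16 DecodeTexelToRGB5A1(u16 c16)
  {
    // Swap the R and B fields, keep G and the mask/alpha bit in place.
    return static_cast<u16>((c16 & 0x83E0u) | ((c16 >> 10) & 0x1Fu) | ((c16 & 0x1Fu) << 10));
  }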