From 364199916f38cb3eb0aff2ba320eab388ba2b621 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sat, 8 Jan 2022 14:22:23 +1000 Subject: [PATCH] GS: Hash local memory blocks instead of expanded textures --- pcsx2/GS/GSExtra.h | 18 +- pcsx2/GS/GSState.cpp | 3 +- pcsx2/GS/Renderers/HW/GSTextureCache.cpp | 253 ++++++++++++++--------- pcsx2/GS/Renderers/HW/GSTextureCache.h | 14 +- 4 files changed, 184 insertions(+), 104 deletions(-) diff --git a/pcsx2/GS/GSExtra.h b/pcsx2/GS/GSExtra.h index 58d66677c3..2941f704a8 100644 --- a/pcsx2/GS/GSExtra.h +++ b/pcsx2/GS/GSExtra.h @@ -111,7 +111,23 @@ static const GSVector2i default_rt_size(1280, 1024); #endif // Maximum texture size to skip preload/hash path. -static constexpr int MAXIMUM_PRELOAD_TEXTURE_SIZE = 512; +// tw/th are the raw TW/TH register values, i.e. log2 of the size, not the expanded power of 2. +__fi static bool CanPreloadTextureSize(u32 tw, u32 th) +{ + static constexpr u32 MAXIMUM_SIZE_IN_ONE_DIRECTION = 10; // 1024 + static constexpr u32 MAXIMUM_SIZE_IN_OTHER_DIRECTION = 8; // 256 + static constexpr u32 MAXIMUM_SIZE_IN_BOTH_DIRECTIONS = 9; // 512 + + // We use an area-based approach here. We want to hash long font maps, + // like 128x1024 (used in FFX), but skip 1024x512 textures (e.g. Xenosaga). + const u32 max_dimension = (tw > th) ? tw : th; + const u32 min_dimension = (tw > th) ? th : tw; + if (max_dimension <= MAXIMUM_SIZE_IN_BOTH_DIRECTIONS) + return true; + + return (max_dimension <= MAXIMUM_SIZE_IN_ONE_DIRECTION && + min_dimension <= MAXIMUM_SIZE_IN_OTHER_DIRECTION); +} // Maximum number of mipmap levels for a texture. // PS2 has a max of 7 levels (1 base + 6 mips). 
diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp index 953f65e404..c52c632a8e 100644 --- a/pcsx2/GS/GSState.cpp +++ b/pcsx2/GS/GSState.cpp @@ -2558,8 +2558,7 @@ void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFR // don't bother checking when preload is on, since we're going to test the whole thing anyway if (GSConfig.PreloadTexture && GSConfig.UseHardwareRenderer() && - (GSConfig.GPUPaletteConversion || - (w <= MAXIMUM_PRELOAD_TEXTURE_SIZE && h <= MAXIMUM_PRELOAD_TEXTURE_SIZE))) + CanPreloadTextureSize(static_cast<u32>(tw), static_cast<u32>(th))) { r = tr; return; diff --git a/pcsx2/GS/Renderers/HW/GSTextureCache.cpp b/pcsx2/GS/Renderers/HW/GSTextureCache.cpp index c9366efe40..331e889acb 100644 --- a/pcsx2/GS/Renderers/HW/GSTextureCache.cpp +++ b/pcsx2/GS/Renderers/HW/GSTextureCache.cpp @@ -888,21 +888,24 @@ void GSTextureCache::InvalidateVideoMem(const GSOffset& off, const GSVector4i& r { u32* RESTRICT valid = s->m_valid; - // Invalidate data of input texture - if (s->m_repeating) + if (!s->CanPreload()) { - // Note: very hot path on snowbling engine game - for (const GSVector2i& k : s->m_p2t[page]) + // Invalidate data of input texture + if (s->m_repeating) { - valid[k.x] &= k.y; + // Note: very hot path on snowbling engine game + for (const GSVector2i& k : s->m_p2t[page]) + { + valid[k.x] &= k.y; + } + } + else + { + valid[page] = 0; } } - else - { - valid[page] = 0; - } - s->m_complete = false; + s->m_complete_layers = 0; found |= b; } @@ -1212,7 +1215,8 @@ void GSTextureCache::InvalidateVideoMemSubTarget(GSTextureCache::Target* rt) void GSTextureCache::IncAge() { - int maxage = GSConfig.PreloadTexture ? (m_src.m_used ? 30 : 60) : (m_src.m_used ? 3 : 6); + const int max_age = m_src.m_used ? 3 : 6; + const int max_preload_age = m_src.m_used ? 30 : 60; // You can't use m_map[page] because Source* are duplicated on several pages. 
for (auto i = m_src.m_surfaces.begin(); i != m_src.m_surfaces.end();) @@ -1229,7 +1233,7 @@ void GSTextureCache::IncAge() else { ++i; - if (++s->m_age > maxage) + if (++s->m_age > (s->CanPreload() ? max_preload_age : max_age)) { m_src.RemoveAt(s); } @@ -1242,7 +1246,7 @@ void GSTextureCache::IncAge() // Sigh, this seems to be used to invalidate surfaces. So set a huge maxage to avoid flicker, // but still invalidate surfaces. (Disgaea 2 fmv when booting the game through the BIOS) // Original maxage was 4 here, Xenosaga 2 needs at least 240, else it flickers on scene transitions. - maxage = 400; // ffx intro scene changes leave the old image untouched for a couple of frames and only then start using it + static constexpr int max_rt_age = 400; // ffx intro scene changes leave the old image untouched for a couple of frames and only then start using it for (int type = 0; type < 2; type++) { @@ -1261,7 +1265,7 @@ void GSTextureCache::IncAge() t->m_32_bits_fmt = false; } - if (++t->m_age > maxage) + if (++t->m_age > max_rt_age) { i = list.erase(i); GL_CACHE("TC: Remove Target(%s): %d (0x%x) due to age", to_string(type), @@ -1845,7 +1849,6 @@ GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFR , m_palette(nullptr) , m_valid_rect(0, 0) , m_target(false) - , m_complete(false) , m_p2t(NULL) , m_from_target(NULL) , m_from_target_TEX0(TEX0) @@ -1874,7 +1877,7 @@ GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFR m_repeating = m_TEX0.IsRepeating(); - if (m_repeating) + if (m_repeating && !CanPreload()) { m_p2t = r->m_mem.GetPage2TileMap(m_TEX0); } @@ -1888,32 +1891,26 @@ GSTextureCache::Source::~Source() _aligned_free(m_write.rect); } -void GSTextureCache::Source::Update(const GSVector4i& rect, int layer) +void GSTextureCache::Source::Update(const GSVector4i& rect, int level) { Surface::UpdateAge(); - if (layer == 0 && (m_complete || m_target)) + if (m_target || (m_complete_layers & (1u << level))) + return; + + if 
(CanPreload()) { + PreloadLevel(level); return; } const GSVector2i& bs = GSLocalMemory::m_psm[m_TEX0.PSM].bs; - const int tw = 1 << m_TEX0.TW; const int th = 1 << m_TEX0.TH; - const bool preload = (GSConfig.PreloadTexture && (GSConfig.GPUPaletteConversion || (tw <= MAXIMUM_PRELOAD_TEXTURE_SIZE && th <= MAXIMUM_PRELOAD_TEXTURE_SIZE))); - if (preload) - { - PreloadUpdate(tw, th, layer); - return; - } - GSVector4i r = rect.ralign(bs); - if (layer == 0 && r.eq(GSVector4i(0, 0, tw, th))) - { - m_complete = true; // lame, but better than nothing - } + if (r.eq(GSVector4i(0, 0, tw, th))) + m_complete_layers |= (1u << level); const GSOffset& off = m_renderer->m_context->offset.tex; GSOffset::BNHelper bn = off.bnMulti(r.left, r.top); @@ -1940,7 +1937,7 @@ void GSTextureCache::Source::Update(const GSVector4i& rect, int layer) { m_valid[row] |= col; - Write(GSVector4i(x, y, x + bs.x, y + bs.y), layer); + Write(GSVector4i(x, y, x + bs.x, y + bs.y), level); blocks++; } @@ -1967,7 +1964,7 @@ void GSTextureCache::Source::Update(const GSVector4i& rect, int layer) { m_valid[row] |= col; - Write(GSVector4i(x, y, x + bs.x, y + bs.y), layer); + Write(GSVector4i(x, y, x + bs.x, y + bs.y), level); blocks++; } @@ -1979,7 +1976,7 @@ void GSTextureCache::Source::Update(const GSVector4i& rect, int layer) if (blocks > 0) { g_perfmon.Put(GSPerfMon::Unswizzle, bs.x * bs.y * blocks << (m_palette ? 
2 : 0)); - Flush(m_write.count, layer); + Flush(m_write.count, level); } } @@ -2105,79 +2102,83 @@ void GSTextureCache::Source::Flush(u32 count, int layer) m_write.count -= count; } -GSTextureCache::Source::HashType GSTextureCache::Source::HashTexture(u8* buff, u32 row_size, u32 pitch, u32 height) -{ - if (row_size == pitch) - { - // fast path since it's all packed - return XXH3_64bits(buff, row_size * height); - } +using BlockHashState = XXH3_state_t; - // slow path where we have to process rows-at-a-time - XXH3_state_t st; +__fi static void BlockHashReset(BlockHashState& st) +{ XXH3_64bits_reset(&st); - for (u32 row = 0; row < height; row++) - { - XXH3_64bits_update(&st, buff, row_size); - buff += pitch; - } +} + +__fi static void BlockHashAccumulate(BlockHashState& st, const u8* bp) +{ + XXH3_64bits_update(&st, bp, BLOCK_SIZE); +} + +__fi static void BlockHashAccumulate(BlockHashState& st, const u8* bp, u32 size) +{ + XXH3_64bits_update(&st, bp, size); +} + +__fi static GSTextureCache::Source::HashType FinishBlockHash(BlockHashState& st) +{ return XXH3_64bits_digest(&st); } -void GSTextureCache::Source::PreloadUpdate(int tw, int th, int layer) +void GSTextureCache::Source::PreloadLevel(int level) { - const GSVector2i& bs = GSLocalMemory::m_psm[m_TEX0.PSM].bs; - const GSOffset& off = m_renderer->m_context->offset.tex; + // m_TEX0 is adjusted for mips (messy, should be changed). + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM]; + const GSVector2i& bs = psm.bs; + const int tw = 1 << m_TEX0.TW; + const int th = 1 << m_TEX0.TH; + + // For textures which are smaller than the block size, we expand and then hash. + // This is because otherwise we get the padding bytes, which can be random junk. + if (tw < bs.x || th < bs.y) + { + PreloadSmallLevel(level); + return; + } + + // From GSLocalMemory foreachBlock(), used for reading textures. + // We want to hash the exact same blocks here. 
const GSVector4i rect(0, 0, tw, th); const GSVector4i block_rect(rect.ralign(bs)); - GSOffset::BNHelper bn = off.bnMulti(0, 0); - - // flag everything as valid - if (m_repeating) + const GSOffset& off = m_renderer->m_context->offset.tex; + GSLocalMemory& mem = m_renderer->m_mem; + HashType hash; { - for (int y = block_rect.top; y < block_rect.bottom; y += bs.y, bn.nextBlockY()) + BlockHashState hash_st; + BlockHashReset(hash_st); + + GSOffset::BNHelper bn = off.bnMulti(block_rect.left, block_rect.top); + const int right = block_rect.right >> off.blockShiftX(); + const int bottom = block_rect.bottom >> off.blockShiftY(); + const int xAdd = (1 << off.blockShiftX()) * (psm.bpp / 8); + + for (; bn.blkY() < bottom; bn.nextBlockY()) { - for (int x = block_rect.left; x < block_rect.right; bn.nextBlockX(), x += bs.x) + for (int x = 0; bn.blkX() < right; bn.nextBlockX(), x += xAdd) { - const u32 i = static_cast<u32>((bn.blkY() << 7) + bn.blkX()); - u32 block = bn.valueNoWrap(); - - if (block < MAX_BLOCKS || m_wrap_gs_mem) - { - u32 addr = i % MAX_BLOCKS; - - u32 row = addr >> 5u; - u32 col = 1 << (addr & 31u); - m_valid[row] |= col; - } + BlockHashAccumulate(hash_st, mem.BlockPtr(bn.value())); } } - } - else - { - for (int y = block_rect.top; y < block_rect.bottom; y += bs.y, bn.nextBlockY()) - { - for (int x = block_rect.left; x < block_rect.right; x += bs.x, bn.nextBlockX()) - { - u32 block = bn.valueNoWrap(); - if (block < MAX_BLOCKS || m_wrap_gs_mem) - { - block %= MAX_BLOCKS; - - u32 row = block >> 5u; - u32 col = 1 << (block & 31u); - m_valid[row] |= col; - } - } - } + hash = FinishBlockHash(hash_st); } - if (layer == 0) - m_complete = true; + // Layer is complete again, regardless of whether the hash matches or not (and we reupload). + const u8 layer_bit = static_cast<u8>(1) << level; + m_complete_layers |= layer_bit; - // decode texture to temporary memory - const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM]; + // Check whether the hash matches. 
Black textures will be 0, so check the valid bit. + if ((m_valid_hashes & layer_bit) && m_layer_hash[level] == hash) + return; + + m_valid_hashes |= layer_bit; + m_layer_hash[level] = hash; + + // Expand texture/apply palette. const int read_width = std::max(tw, psm.bs.x); u32 pitch = static_cast<u32>(read_width) * sizeof(u32); u32 row_size = static_cast<u32>(tw) * sizeof(u32); @@ -2189,19 +2190,79 @@ void GSTextureCache::Source::PreloadUpdate(int tw, int th, int layer) rtx = psm.rtxP; } + // If we can stream it directly to GPU memory, do so, otherwise go through a temp buffer. + GSTexture::GSMap map; + if (rect.eq(block_rect) && m_texture->Map(map, &rect, level)) + { + (m_renderer->m_mem.*rtx)(off, block_rect, map.bits, map.pitch, m_TEXA); + m_texture->Unmap(); + } + else + { + u8* buff = m_temp; + (m_renderer->m_mem.*rtx)(off, block_rect, buff, pitch, m_TEXA); + m_texture->Update(rect, buff, pitch, level); + } +} + +void GSTextureCache::Source::PreloadSmallLevel(int level) +{ + // m_TEX0 is adjusted for mips (messy, should be changed). + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM]; + const GSVector2i& bs = psm.bs; + const int tw = 1 << m_TEX0.TW; + const int th = 1 << m_TEX0.TH; + const GSVector4i rect(0, 0, tw, th); + const GSVector4i block_rect(rect.ralign(bs)); + const GSOffset& off = m_renderer->m_context->offset.tex; + GSLocalMemory& mem = m_renderer->m_mem; + + // Expand texture/apply palette. + u32 pitch = static_cast<u32>(block_rect.z) * sizeof(u32); + u32 row_size = static_cast<u32>(tw) * sizeof(u32); + GSLocalMemory::readTexture rtx = psm.rtx; + if (m_palette) + { + pitch >>= 2; + row_size >>= 2; + rtx = psm.rtxP; + } + + // Use temp buffer for expanding, since we may not need to update. 
u8* buff = m_temp; (m_renderer->m_mem.*rtx)(off, block_rect, buff, pitch, m_TEXA); - // hash the texture - const HashType hash = HashTexture(buff, row_size, pitch, static_cast<u32>(th)); - const u8 layer_bit = static_cast<u8>(1) << layer; - if ((m_valid_hashes & layer_bit) && m_layer_hash[layer] == hash) + // Hash the expanded texture. + HashType hash; + { + u8* ptr = buff; + BlockHashState state; + BlockHashReset(state); + if (pitch == row_size) + { + BlockHashAccumulate(state, ptr, pitch * static_cast<u32>(th)); + } + else + { + for (int y = 0; y < th; y++, ptr += pitch) + BlockHashAccumulate(state, ptr, row_size); + } + hash = FinishBlockHash(state); + } + + // Layer is complete again, regardless of whether the hash matches or not (and we reupload). + const u8 layer_bit = static_cast<u8>(1) << level; + m_complete_layers |= layer_bit; + + // Check whether the hash matches. Black textures will be 0, so check the valid bit. + if ((m_valid_hashes & layer_bit) && m_layer_hash[level] == hash) return; - // reupload m_valid_hashes |= layer_bit; - m_layer_hash[layer] = hash; - m_texture->Update(rect, buff, pitch, layer); + m_layer_hash[level] = hash; + + // Upload to GPU. 
+ m_texture->Update(rect, buff, pitch, level); } bool GSTextureCache::Source::ClutMatch(const PaletteKey& palette_key) diff --git a/pcsx2/GS/Renderers/HW/GSTextureCache.h b/pcsx2/GS/Renderers/HW/GSTextureCache.h index 0a6b9861c1..0d7a7bea16 100644 --- a/pcsx2/GS/Renderers/HW/GSTextureCache.h +++ b/pcsx2/GS/Renderers/HW/GSTextureCache.h @@ -100,16 +100,18 @@ public: class Source : public Surface { + public: + using HashType = u64; + + private: struct { GSVector4i* rect; u32 count; } m_write; - using HashType = u64; - - HashType HashTexture(u8* buff, u32 row_size, u32 pitch, u32 height); - void PreloadUpdate(int tw, int th, int layer); + void PreloadLevel(int level); + void PreloadSmallLevel(int level); void Write(const GSVector4i& r, int layer); void Flush(u32 count, int layer); @@ -120,8 +122,8 @@ public: u32 m_valid[MAX_PAGES]; // each u32 bits map to the 32 blocks of that page GSVector4i m_valid_rect; u8 m_valid_hashes = 0; + u8 m_complete_layers = 0; bool m_target; - bool m_complete; bool m_repeating; std::vector<GSVector2i>* m_p2t; // Keep a trace of the target origin. There is no guarantee that pointer will @@ -139,6 +141,8 @@ public: Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, u8* temp, bool dummy_container = false); virtual ~Source(); + __fi bool CanPreload() const { return (GSConfig.PreloadTexture && CanPreloadTextureSize(m_TEX0.TW, m_TEX0.TH)); } + void Update(const GSVector4i& rect, int layer = 0); void UpdateLayer(const GIFRegTEX0& TEX0, const GSVector4i& rect, int layer = 0);