GSdx-TC: Palette management rework. (#2344)

This follows the 4th point of #2310.
The idea was to rework the current palette texture management to improve performances with games that uses multiple palettes for the same data texture.

The new management shows small to none performances improvement in almost every game in terms of FPS, and it lowers the GPU BUS usage by some percentage points in games like Baldur's Gate: Dark Alliance (9% to 7%) which uses many palettes.

The hot topic is that the performances in Zone Of The Enders 2 skyrocketed (2x), because of the fact that the game uses many palettes and a small number of textures to render it's effects.

For more detailed information check the PR  #2344
This commit is contained in:
iMineLink 2018-11-04 22:06:24 +01:00 committed by lightningterror
parent 77a924ee7d
commit 9fa1b290ba
3 changed files with 265 additions and 39 deletions

View File

@ -1330,12 +1330,15 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
// We need the palette to convert the depth to the correct alpha value.
if (!tex->m_palette) {
// If this asserts fails, the allocated palette texture (tex->m_palette)
// is leaked because it is not released on tex destruction
ASSERT(!tex->m_should_have_tex_palette); // No 8-bit texture enabled
tex->m_palette = m_dev->CreateTexture(256, 1);
const uint32* clut = m_mem.m_clut;
int pal = GSLocalMemory::m_psm[tex->m_TEX0.PSM].pal;
uint16 pal = GSLocalMemory::m_psm[tex->m_TEX0.PSM].pal;
tex->m_palette->Update(GSVector4i(0, 0, pal, 1), clut, pal * sizeof(clut[0]));
tex->m_initpalette = false;
dev->PSSetShaderResource(1, tex->m_palette);
}

View File

@ -29,7 +29,8 @@ bool GSTextureCache::m_disable_partial_invalidation = false;
bool GSTextureCache::m_wrap_gs_mem = false;
GSTextureCache::GSTextureCache(GSRenderer* r)
: m_renderer(r)
: m_renderer(r),
m_palette_map(r)
{
s_IS_DIRECT3D11 = theApp.GetCurrentRendererType() == GSRendererType::DX1011_HW;
s_IS_OPENGL = theApp.GetCurrentRendererType() == GSRendererType::OGL_HW;
@ -95,6 +96,8 @@ void GSTextureCache::RemoveAll()
m_dst[type].clear();
}
m_palette_map.Clear();
}
GSTextureCache::Source* GSTextureCache::LookupDepthSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, bool palette)
@ -165,12 +168,7 @@ GSTextureCache::Source* GSTextureCache::LookupDepthSource(const GIFRegTEX0& TEX0
// If it is too expensive, one could cut memory allocation in Source constructor for this
// use case.
if (palette) {
const uint32* clut = m_renderer->m_mem.m_clut;
int size = psm_s.pal * sizeof(clut[0]);
src->m_palette = m_renderer->m_dev->CreateTexture(256, 1);
src->m_palette->Update(GSVector4i(0, 0, psm_s.pal, 1), clut, size);
src->m_initpalette = false;
AttachPaletteToSource(src, psm_s.pal);
}
m_src.m_surfaces.insert(src);
@ -229,7 +227,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
// converted by the CPU (s->m_palette == NULL), we need to ensure
// palette content is the same.
// Note: content of the palette will be uploaded at the end of the function
if (psm_s.pal > 0 && s->m_palette == NULL && !GSVector4i::compare64(clut, s->m_clut, psm_s.pal * sizeof(clut[0])))
if (psm_s.pal > 0 && !s->m_should_have_tex_palette && !GSVector4i::compare64(clut, s->m_clut, psm_s.pal * sizeof(clut[0])))
continue;
// We request a 24/16 bit RGBA texture. Alpha expansion was done by
@ -403,15 +401,8 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con
psm_str(TEX0.PSM));
}
if (src->m_palette)
{
int size = psm_s.pal * sizeof(clut[0]);
if(src->m_initpalette || !GSVector4i::update(src->m_clut, clut, size))
{
src->m_palette->Update(GSVector4i(0, 0, psm_s.pal, 1), src->m_clut, size);
src->m_initpalette = false;
}
if (src->m_should_have_tex_palette && (!src->m_clut || !GSVector4i::compare64(src->m_clut, clut, psm_s.pal * sizeof(uint32)))) {
AttachPaletteToSource(src, psm_s.pal);
}
src->Update(r);
@ -1347,7 +1338,7 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con
// However it is different here. We want to reuse a Render Target as a texture.
// Because the texture is already on the GPU, CPU can't convert it.
if (psm.pal > 0) {
src->m_palette = m_renderer->m_dev->CreateTexture(256, 1);
src->m_should_have_tex_palette = true;
}
// Disable linear filtering for various GS post-processing effect
// 1/ Palette is used to interpret the alpha channel of the RT as an index.
@ -1461,19 +1452,21 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con
if (m_paltex && psm.pal > 0)
{
src->m_texture = m_renderer->m_dev->CreateTexture(tw, th, Get8bitFormat());
src->m_palette = m_renderer->m_dev->CreateTexture(256, 1);
src->m_should_have_tex_palette = true;
}
else
else {
src->m_texture = m_renderer->m_dev->CreateTexture(tw, th);
if (psm.pal > 0) {
uint16 palette_size = psm.pal * sizeof(uint32);
src->m_clut = (uint32*)_aligned_malloc(palette_size * sizeof(uint32), 64);
memcpy(src->m_clut, (const uint32*)m_renderer->m_mem.m_clut, palette_size);
}
}
}
ASSERT(src->m_texture);
if(psm.pal > 0)
{
memcpy(src->m_clut, (const uint32*)m_renderer->m_mem.m_clut, psm.pal * sizeof(uint32));
}
m_src.Add(src, TEX0, m_renderer->m_context->offset.tex);
return src;
@ -1563,8 +1556,10 @@ void GSTextureCache::Surface::UpdateAge()
GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint8* temp, bool dummy_container)
: Surface(r, temp)
, m_palette_obj(nullptr)
, m_palette(NULL)
, m_initpalette(true)
, m_should_have_tex_palette(false)
, m_clut(NULL)
, m_target(false)
, m_complete(false)
, m_spritehack_t(false)
@ -1580,8 +1575,6 @@ GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFR
m_write.rect = NULL;
m_write.count = 0;
m_clut = NULL;
m_repeating = false;
} else {
@ -1589,10 +1582,6 @@ GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFR
memset(m_valid, 0, sizeof(m_valid));
m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 32);
memset(m_clut, 0, 256*sizeof(uint32));
m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 32);
m_write.count = 0;
@ -1610,9 +1599,10 @@ GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFR
GSTextureCache::Source::~Source()
{
m_renderer->m_dev->Recycle(m_palette);
if (!m_palette_obj) {
_aligned_free(m_clut);
m_renderer->m_dev->Recycle(m_palette);
}
_aligned_free(m_write.rect);
}
@ -2006,7 +1996,7 @@ void GSTextureCache::SourceMap::Add(Source* s, const GIFRegTEX0& TEX0, GSOffset*
void GSTextureCache::SourceMap::RemoveAll()
{
for(auto s : m_surfaces) delete s;
for (auto s : m_surfaces) delete s;
m_surfaces.clear();
@ -2055,3 +2045,171 @@ void GSTextureCache::SourceMap::RemoveAt(Source* s)
delete s;
}
// Query the PaletteMap for a valid Palette, then assign both palette texture pointer and clut copy pointer to the Source object
void GSTextureCache::AttachPaletteToSource(Source* s, uint16 pal)
{
std::shared_ptr<Palette> p = m_palette_map.LookupPalette(pal);
s->m_palette_obj = p;
s->m_palette = p->GetPaletteGSTexture();
s->m_clut = p->GetClut();
}
// GSTextureCache::Palette
// Creates a new palette texture with current clut content, keeping a reference to its copy
GSTextureCache::Palette::Palette(const GSRenderer* renderer, uint16 pal) {
uint16 palette_size = pal * sizeof(uint32);
m_clut = (uint32*)_aligned_malloc(palette_size, 64);
m_renderer = renderer;
memcpy(m_clut, (const uint32*)m_renderer->m_mem.m_clut, palette_size);
m_tex_palette = m_renderer->m_dev->CreateTexture(256, 1);
m_tex_palette->Update(GSVector4i(0, 0, pal, 1), m_clut, palette_size);
}
// Default destructor, recycles palette texture and frees clut copy
GSTextureCache::Palette::~Palette() {
m_renderer->m_dev->Recycle(GetPaletteGSTexture()); // Recycle palette texture
_aligned_free(GetClut()); // Free clut copy
}
uint32* GSTextureCache::Palette::GetClut() {
return m_clut;
}
GSTexture* GSTextureCache::Palette::GetPaletteGSTexture() {
return m_tex_palette;
}
// GSTextureCache::PaletteKeyHash
// Hashes the content of the clut.
// The hashing function is implemented by taking two things into account:
// 1) The clut can be an array of 16 or 256 uint32 (depending on the pal parameter) and in order to speed up the computation of the hash
// the array is hashed in blocks of 16 uint32, so for clut of size 16 uint32 the hashing is computed in one pass and for clut of 256 uint32
// it is computed in 16 passes,
// 2) The clut can contain many 0s, so as a way to increase the spread of hashing values for small changes in the input clut the hashing function
// is using addition in combination with logical XOR operator; The addition constants are large prime numbers, which may help in achieving what intended.
std::size_t GSTextureCache::PaletteKeyHash::operator()(const PaletteKey &key) const {
uint16 pal = key.pal;
const uint32* clut = key.clut;
size_t clut_hash = 3831179159;
for (uint16 i = 0; i < pal; i += 16) {
clut_hash = (clut_hash + 1488000301) ^ (clut[i] + 33644011);
clut_hash = (clut_hash + 3831179159) ^ (clut[i + 1] + 47627467);
clut_hash = (clut_hash + 3659574209) ^ (clut[i + 2] + 577038523);
clut_hash = (clut_hash + 33644011) ^ (clut[i + 3] + 3491555267);
clut_hash = (clut_hash + 777771959) ^ (clut[i + 4] + 3301075993);
clut_hash = (clut_hash + 4019618579) ^ (clut[i + 5] + 4186992613);
clut_hash = (clut_hash + 3465668953) ^ (clut[i + 6] + 3043435883);
clut_hash = (clut_hash + 3494478943) ^ (clut[i + 7] + 3441897883);
clut_hash = (clut_hash + 3432010979) ^ (clut[i + 8] + 2167922789);
clut_hash = (clut_hash + 1570862863) ^ (clut[i + 9] + 3401920591);
clut_hash = (clut_hash + 1002648679) ^ (clut[i + 10] + 1293530519);
clut_hash = (clut_hash + 551381741) ^ (clut[i + 11] + 2539834039);
clut_hash = (clut_hash + 3768974459) ^ (clut[i + 12] + 169943507);
clut_hash = (clut_hash + 862380703) ^ (clut[i + 13] + 2906932549);
clut_hash = (clut_hash + 3433082137) ^ (clut[i + 14] + 4234384109);
clut_hash = (clut_hash + 2679083843) ^ (clut[i + 15] + 2719605247);
}
return clut_hash;
};
// GSTextureCache::PaletteKeyEqual
// Compare clut contents
bool GSTextureCache::PaletteKeyEqual::operator()(const PaletteKey &lhs, const PaletteKey &rhs) const {
ASSERT(lhs.pal == rhs.pal); // By design, each map SHOULD contain only PaletteKey with the same pal value
uint16 pal = lhs.pal;
uint16 palette_size = pal * sizeof(uint32);
return GSVector4i::compare64(lhs.clut, rhs.clut, palette_size);
};
// GSTextureCache::PaletteMap
// Default constructor, stores renderer pointer and reverses space in the maps
GSTextureCache::PaletteMap::PaletteMap(const GSRenderer* renderer) {
this->m_renderer = renderer;
for (auto& map : m_maps) {
map.reserve(MAX_SIZE);
}
}
// Retrieves the palette with the desired clut
std::shared_ptr<GSTextureCache::Palette> GSTextureCache::PaletteMap::LookupPalette(uint16 pal) {
// Choose which hash map search into:
// pal == 16 : index 0
// pal == 256 : index 1
auto& map = m_maps[pal == 16 ? 0 : 1];
const uint32* clut = (const uint32*)m_renderer->m_mem.m_clut;
// Create PaletteKey for searching into map (clut is actually not copied, so do not store this key into the map)
PaletteKey palette_key = { clut, pal };
auto it1 = map.find(palette_key);
if (it1 != map.end()) {
// Clut content match, HIT
return it1->second;
}
// No Palette with matching clut content hash, MISS
if (map.size() > MAX_SIZE) {
// If the map is too big, try to clean it by disposing and removing unused palettes, before adding the new one
GL_INS("WARNING, %u-bit PaletteMap (Size %u): Max size %u exceeded, clearing unused palettes.", pal * sizeof(uint32), map.size(), MAX_SIZE);
uint32 current_size = map.size();
for (auto it = map.begin(); it != map.end(); ) {
// If the palette is unused, there is only one shared pointers holding a reference to the unused Palette object,
// and this shared pointer is the one stored in the map itself
if (it->second.use_count() <= 1) {
// Palette is unused
it = map.erase(it); // Erase element from map
// The palette object should now be gone as the shared pointer to the object in the map is deleted
}
else {
++it;
}
}
uint32 cleared_palette_count = current_size - (uint32)map.size();
if (cleared_palette_count == 0) {
GL_INS("ERROR, %u-bit PaletteMap (Size %u): Max size %u exceeded, could not clear any palette, negative performance impact.", pal * sizeof(uint32), map.size(), MAX_SIZE);
}
else {
map.reserve(MAX_SIZE); // Ensure map capacity is not modified by the clearing
GL_INS("INFO, %u-bit PaletteMap (Size %u): Cleared %u palettes.", pal * sizeof(uint32), map.size(), cleared_palette_count);
}
}
// Create new Palette using shared pointer
std::shared_ptr<Palette> palette = std::make_shared<Palette>(m_renderer, pal);
// Create key for storing the Palette into the map (use copy of the clut stored into Palette itself as key attribute)
palette_key = { palette->GetClut(), pal };
// Add the new palette to the map
map.emplace(palette_key, palette);
GL_CACHE("TC, %u-bit PaletteMap (Size %u): Added new palette.", pal * sizeof(uint32), map.size());
// Return the shared pointer to the newly created Palette
return palette;
}
void GSTextureCache::PaletteMap::Clear() {
for (auto& map : m_maps) {
map.clear(); // Clear all the nodes of the map, deleting Palette objects managed by shared pointers as they should be unused elsewhere
map.reserve(MAX_SIZE); // Ensure map capacity is not modified by the clearing
}
}

View File

@ -51,6 +51,47 @@ public:
void UpdateAge();
};
class Palette
{
private:
uint32* m_clut; // Pointer to a copy of relevant clut
GSTexture* m_tex_palette; // Pointer to valid texture with relevant clut as content
const GSRenderer* m_renderer; // Pointer to the current renderer, needed to recycle the referenced GSTexture on destruction
public:
Palette(const GSRenderer* renderer, uint16 pal); // Creates a copy of the current clut and a texture with its content
virtual ~Palette(); // Default destructor, recycles palette texture and frees clut copy
// Disable copy constructor and copy operator
Palette(const Palette&) = delete;
Palette& operator=(const Palette&) = delete;
// Disable move constructor and move operator
Palette(const Palette&&) = delete;
Palette& operator=(const Palette&&) = delete;
// Getter for clut pointer
uint32* GetClut();
// Getter for palette texture pointer
GSTexture* GetPaletteGSTexture();
};
struct PaletteKey {
const uint32* clut;
uint16 pal;
};
struct PaletteKeyHash {
// Calculate hash
virtual std::size_t operator()(const PaletteKey &key) const;
};
struct PaletteKeyEqual {
// Compare clut contents
virtual bool operator()(const PaletteKey &lhs, const PaletteKey &rhs) const;
};
class Source : public Surface
{
struct {GSVector4i* rect; uint32 count;} m_write;
@ -59,8 +100,9 @@ public:
void Flush(uint32 count, int layer);
public:
std::shared_ptr<Palette> m_palette_obj; // Shared pointer to the relevant Palette object (if any)
GSTexture* m_palette;
bool m_initpalette;
bool m_should_have_tex_palette; // Enables m_clut (and possibly m_palette) recycling on object destruction
uint32 m_valid[MAX_PAGES]; // each uint32 bits map to the 32 blocks of that page
uint32* m_clut;
bool m_target;
@ -105,6 +147,26 @@ public:
void Update();
};
class PaletteMap
{
private:
static const uint16 MAX_SIZE = 65535; // Max size of each map.
const GSRenderer* m_renderer; // Reference to the current renderer
// Array of 2 maps, the first for 64B palettes and the second for 1024B palettes.
// Each map stores the key PaletteKey (clut copy, pal value) pointing to the relevant shared pointer to Palette object.
// There is one PaletteKey per Palette, and the hashing and comparison of PaletteKey is done with custom operators PaletteKeyHash and PaletteKeyEqual.
std::array<std::unordered_map<PaletteKey, std::shared_ptr<Palette>, PaletteKeyHash, PaletteKeyEqual>, 2> m_maps;
public:
PaletteMap(const GSRenderer* renderer); // Default constructor
// Retrieves a shared pointer to a valid Palette from m_maps or creates a new one adding it to the data structure
std::shared_ptr<Palette> LookupPalette(uint16 pal);
void Clear(); // Clears m_maps, deleting clut(s) arrays and recycling palette textures
};
class SourceMap
{
public:
@ -123,6 +185,7 @@ public:
protected:
GSRenderer* m_renderer;
PaletteMap m_palette_map;
SourceMap m_src;
FastList<Target*> m_dst[2];
bool m_paltex;
@ -174,4 +237,6 @@ public:
}
void PrintMemoryUsage();
void AttachPaletteToSource(Source* s, uint16 pal);
};