From c812ab6a633b985f0c765428acf1062b4929a9c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Lam?= Date: Thu, 15 Apr 2021 12:35:46 +0200 Subject: [PATCH] Jit: Optimize block link queries by using hash tables Repeated erase() + iteration on a std::multimap is extremely slow. Slow enough that it causes a 7-second-long stutter during some transitions in F-Zero X (an N64 VC game that triggers many, many icache invalidations). And slow enough that JitBaseBlockCache::DestroyBlock shows up on a flame graph as taking >50% of total CPU time on the CPU-GPU thread: https://i.imgur.com/vvqiFL6.png This commit optimises those block link queries by replacing the std::multimap (which is typically implemented with red-black trees) with hash tables. Master: https://i.imgur.com/vvqiFL6.png / 7s stutters (starting from 5.0-2021 and with branch following disabled) This commit: https://i.imgur.com/hAO74fy.png / ~0.7s stutters, which is pretty close to 5.0 stable. (5.0-2021 introduced the performance regression and it is especially noticeable when branch following is disabled, which is the case for all N64 VC games since 5.0-8377.) 
--- .../Core/Core/PowerPC/JitCommon/JitCache.cpp | 38 +++++++++---------- Source/Core/Core/PowerPC/JitCommon/JitCache.h | 6 ++- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp index ed29151e6c..3eb8873802 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp @@ -127,7 +127,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link, { for (const auto& e : block.linkData) { - links_to.emplace(e.exitAddress, &block); + links_to[e.exitAddress].insert(&block); } LinkBlock(block); @@ -299,13 +299,14 @@ void JitBaseBlockCache::LinkBlockExits(JitBlock& block) void JitBaseBlockCache::LinkBlock(JitBlock& block) { LinkBlockExits(block); - auto ppp = links_to.equal_range(block.effectiveAddress); + const auto it = links_to.find(block.effectiveAddress); + if (it == links_to.end()) + return; - for (auto iter = ppp.first; iter != ppp.second; ++iter) + for (JitBlock* b2 : it->second) { - JitBlock& b2 = *iter->second; - if (block.msrBits == b2.msrBits) - LinkBlockExits(b2); + if (block.msrBits == b2->msrBits) + LinkBlockExits(*b2); } } @@ -318,14 +319,15 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block) } // Unlink all exits of other blocks which points to this block - auto ppp = links_to.equal_range(block.effectiveAddress); - for (auto iter = ppp.first; iter != ppp.second; ++iter) + const auto it = links_to.find(block.effectiveAddress); + if (it == links_to.end()) + return; + for (JitBlock* sourceBlock : it->second) { - JitBlock& sourceBlock = *iter->second; - if (sourceBlock.msrBits != block.msrBits) + if (sourceBlock->msrBits != block.msrBits) continue; - for (auto& e : sourceBlock.linkData) + for (auto& e : sourceBlock->linkData) { if (e.exitAddress == block.effectiveAddress) { @@ -346,14 +348,12 @@ void JitBaseBlockCache::DestroyBlock(JitBlock& block) // Delete linking 
addresses for (const auto& e : block.linkData) { - auto it = links_to.equal_range(e.exitAddress); - while (it.first != it.second) - { - if (it.first->second == &block) - it.first = links_to.erase(it.first); - else - it.first++; - } + auto it = links_to.find(e.exitAddress); + if (it == links_to.end()) + continue; + it->second.erase(&block); + if (it->second.empty()) + links_to.erase(it); } // Raise an signal if we are going to call this block again diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 678e808f2a..53e18e6645 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include "Common/CommonTypes.h" @@ -182,7 +184,7 @@ private: // links_to hold all exit points of all valid blocks in a reverse way. // It is used to query all blocks which links to an address. - std::multimap links_to; // destination_PC -> number + std::unordered_map> links_to; // destination_PC -> number // Map indexed by the physical address of the entry point. // This is used to query the block based on the current PC in a slow way. @@ -192,7 +194,7 @@ private: // This is used for invalidation of memory regions. The range is grouped // in macro blocks of each 0x100 bytes. static constexpr u32 BLOCK_RANGE_MAP_ELEMENTS = 0x100; - std::map> block_range_map; + std::map> block_range_map; // This bitsets shows which cachelines overlap with any blocks. // It is used to provide a fast way to query if no icache invalidation is needed.