Jit: Optimize block link queries by using hash tables
Repeated erase() + iteration on a std::multimap is extremely slow: slow enough that it causes a 7-second-long stutter during some transitions in F-Zero X (an N64 VC game that triggers many, many icache invalidations), and slow enough that JitBaseBlockCache::DestroyBlock shows up on a flame graph as taking >50% of total CPU time on the CPU-GPU thread (https://i.imgur.com/vvqiFL6.png). This commit optimises those block link queries by replacing the std::multimap (which is typically implemented with red-black trees) with hash tables. Master: https://i.imgur.com/vvqiFL6.png / 7s stutters (starting from 5.0-2021 and with branch following disabled). This commit: https://i.imgur.com/hAO74fy.png / ~0.7s stutters, which is pretty close to 5.0 stable. (5.0-2021 introduced the performance regression, and it is especially noticeable when branch following is disabled, which is the case for all N64 VC games since 5.0-8377.)
This commit is contained in:
parent
18e84361d9
commit
c812ab6a63
|
@ -127,7 +127,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
|
|||
{
|
||||
for (const auto& e : block.linkData)
|
||||
{
|
||||
links_to.emplace(e.exitAddress, &block);
|
||||
links_to[e.exitAddress].insert(&block);
|
||||
}
|
||||
|
||||
LinkBlock(block);
|
||||
|
@ -299,13 +299,14 @@ void JitBaseBlockCache::LinkBlockExits(JitBlock& block)
|
|||
void JitBaseBlockCache::LinkBlock(JitBlock& block)
|
||||
{
|
||||
LinkBlockExits(block);
|
||||
auto ppp = links_to.equal_range(block.effectiveAddress);
|
||||
const auto it = links_to.find(block.effectiveAddress);
|
||||
if (it == links_to.end())
|
||||
return;
|
||||
|
||||
for (auto iter = ppp.first; iter != ppp.second; ++iter)
|
||||
for (JitBlock* b2 : it->second)
|
||||
{
|
||||
JitBlock& b2 = *iter->second;
|
||||
if (block.msrBits == b2.msrBits)
|
||||
LinkBlockExits(b2);
|
||||
if (block.msrBits == b2->msrBits)
|
||||
LinkBlockExits(*b2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -318,14 +319,15 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
|
|||
}
|
||||
|
||||
// Unlink all exits of other blocks that point to this block
|
||||
auto ppp = links_to.equal_range(block.effectiveAddress);
|
||||
for (auto iter = ppp.first; iter != ppp.second; ++iter)
|
||||
const auto it = links_to.find(block.effectiveAddress);
|
||||
if (it == links_to.end())
|
||||
return;
|
||||
for (JitBlock* sourceBlock : it->second)
|
||||
{
|
||||
JitBlock& sourceBlock = *iter->second;
|
||||
if (sourceBlock.msrBits != block.msrBits)
|
||||
if (sourceBlock->msrBits != block.msrBits)
|
||||
continue;
|
||||
|
||||
for (auto& e : sourceBlock.linkData)
|
||||
for (auto& e : sourceBlock->linkData)
|
||||
{
|
||||
if (e.exitAddress == block.effectiveAddress)
|
||||
{
|
||||
|
@ -346,14 +348,12 @@ void JitBaseBlockCache::DestroyBlock(JitBlock& block)
|
|||
// Delete linking addresses
|
||||
for (const auto& e : block.linkData)
|
||||
{
|
||||
auto it = links_to.equal_range(e.exitAddress);
|
||||
while (it.first != it.second)
|
||||
{
|
||||
if (it.first->second == &block)
|
||||
it.first = links_to.erase(it.first);
|
||||
else
|
||||
it.first++;
|
||||
}
|
||||
auto it = links_to.find(e.exitAddress);
|
||||
if (it == links_to.end())
|
||||
continue;
|
||||
it->second.erase(&block);
|
||||
if (it->second.empty())
|
||||
links_to.erase(it);
|
||||
}
|
||||
|
||||
// Raise a signal if we are going to call this block again
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
#include <memory>
|
||||
#include <set>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "Common/CommonTypes.h"
|
||||
|
@ -182,7 +184,7 @@ private:
|
|||
|
||||
// links_to holds all exit points of all valid blocks in a reverse way.
|
||||
// It is used to query all blocks that link to an address.
|
||||
std::multimap<u32, JitBlock*> links_to; // destination_PC -> number
|
||||
std::unordered_map<u32, std::unordered_set<JitBlock*>> links_to; // destination_PC -> number
|
||||
|
||||
// Map indexed by the physical address of the entry point.
|
||||
// This is used to query the block based on the current PC in a slow way.
|
||||
|
@ -192,7 +194,7 @@ private:
|
|||
// This is used for invalidation of memory regions. The range is grouped
|
||||
// in macro blocks of 0x100 bytes each.
|
||||
static constexpr u32 BLOCK_RANGE_MAP_ELEMENTS = 0x100;
|
||||
std::map<u32, std::set<JitBlock*>> block_range_map;
|
||||
std::map<u32, std::unordered_set<JitBlock*>> block_range_map;
|
||||
|
||||
// This bitset shows which cachelines overlap with any blocks.
|
||||
// It is used to provide a fast way to query if no icache invalidation is needed.
|
||||
|
|
Loading…
Reference in New Issue