Jit: Optimize block link queries by using hash tables

Repeated erase() + iteration on a std::multimap is extremely slow.

Slow enough that it causes a 7 second long stutter during some
transitions in F-Zero X (an N64 VC game that triggers many, many icache
invalidations).

And slow enough that JitBaseBlockCache::DestroyBlock shows up on a
flame graph as taking >50% of total CPU time on the CPU-GPU thread:
https://i.imgur.com/vvqiFL6.png

This commit optimises those block link queries by replacing the
std::multimap (which is typically implemented with red-black trees)
with hash tables.

Master: https://i.imgur.com/vvqiFL6.png / 7s stutters
(starting from 5.0-2021 and with branch following disabled)

This commit: https://i.imgur.com/hAO74fy.png / ~0.7s stutters, which
is pretty close to 5.0 stable. (5.0-2021 introduced the performance
regression and it is especially noticeable when branch following
is disabled, which is the case for all N64 VC games since 5.0-8377.)
This commit is contained in:
Léo Lam 2021-04-15 12:35:46 +02:00
parent 18e84361d9
commit c812ab6a63
No known key found for this signature in database
GPG Key ID: 0DF30F9081000741
2 changed files with 23 additions and 21 deletions

View File

@ -127,7 +127,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
{
for (const auto& e : block.linkData)
{
links_to.emplace(e.exitAddress, &block);
links_to[e.exitAddress].insert(&block);
}
LinkBlock(block);
@ -299,13 +299,14 @@ void JitBaseBlockCache::LinkBlockExits(JitBlock& block)
void JitBaseBlockCache::LinkBlock(JitBlock& block)
{
LinkBlockExits(block);
auto ppp = links_to.equal_range(block.effectiveAddress);
const auto it = links_to.find(block.effectiveAddress);
if (it == links_to.end())
return;
for (auto iter = ppp.first; iter != ppp.second; ++iter)
for (JitBlock* b2 : it->second)
{
JitBlock& b2 = *iter->second;
if (block.msrBits == b2.msrBits)
LinkBlockExits(b2);
if (block.msrBits == b2->msrBits)
LinkBlockExits(*b2);
}
}
@ -318,14 +319,15 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
}
// Unlink all exits of other blocks which point to this block
auto ppp = links_to.equal_range(block.effectiveAddress);
for (auto iter = ppp.first; iter != ppp.second; ++iter)
const auto it = links_to.find(block.effectiveAddress);
if (it == links_to.end())
return;
for (JitBlock* sourceBlock : it->second)
{
JitBlock& sourceBlock = *iter->second;
if (sourceBlock.msrBits != block.msrBits)
if (sourceBlock->msrBits != block.msrBits)
continue;
for (auto& e : sourceBlock.linkData)
for (auto& e : sourceBlock->linkData)
{
if (e.exitAddress == block.effectiveAddress)
{
@ -346,14 +348,12 @@ void JitBaseBlockCache::DestroyBlock(JitBlock& block)
// Delete linking addresses
for (const auto& e : block.linkData)
{
auto it = links_to.equal_range(e.exitAddress);
while (it.first != it.second)
{
if (it.first->second == &block)
it.first = links_to.erase(it.first);
else
it.first++;
}
auto it = links_to.find(e.exitAddress);
if (it == links_to.end())
continue;
it->second.erase(&block);
if (it->second.empty())
links_to.erase(it);
}
// Raise a signal if we are going to call this block again

View File

@ -12,6 +12,8 @@
#include <memory>
#include <set>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "Common/CommonTypes.h"
@ -182,7 +184,7 @@ private:
// links_to holds all exit points of all valid blocks in a reverse way.
// It is used to query all blocks which links to an address.
std::multimap<u32, JitBlock*> links_to; // destination_PC -> number
std::unordered_map<u32, std::unordered_set<JitBlock*>> links_to; // destination_PC -> number
// Map indexed by the physical address of the entry point.
// This is used to query the block based on the current PC in a slow way.
@ -192,7 +194,7 @@ private:
// This is used for invalidation of memory regions. The range is grouped
// in macro blocks of each 0x100 bytes.
static constexpr u32 BLOCK_RANGE_MAP_ELEMENTS = 0x100;
std::map<u32, std::set<JitBlock*>> block_range_map;
std::map<u32, std::unordered_set<JitBlock*>> block_range_map;
// This bitset shows which cachelines overlap with any blocks.
// It is used to provide a fast way to query if no icache invalidation is needed.