Optimize JitCache::InvalidateICache by maintaining a "valid blocks" bitset

Most InvalidateICache calls are for a 32-byte block: this is the
number of bytes invalidated by the PowerPC dcb*/icb* instructions. Profiling
shows that a lot of CPU time is spent checking whether any JIT blocks
are covered by these 32 bytes (using std::map::lower_bound).

This patch adds a bitset containing the state of every 32-byte block in
RAM (JIT cached/not JIT cached). Using that, a 32-byte InvalidateICache
call can check in the bitset whether any JIT block might be invalidated. A
bitset check is a lot faster than a std::map::lower_bound operation, improving
the performance of JitCache::InvalidateICache by more than 100%.

Some practical numbers:

* Xenoblade Chronicles (PAL)
  56.04FPS -> 59.28FPS (+5.78%)
* The Last Story (PAL)
  30.9FPS -> 32.83FPS (+6.25%)
* Super Mario Galaxy (PAL)
  59.76FPS -> 62.46FPS (+4.52%)

This function still takes more time than it should - more optimization in
this area might be possible (specializing for 32-byte blocks to avoid
useless memcpy, for example).
This commit is contained in:
Pierre Bourdon 2012-10-06 01:49:09 +02:00
parent 8cefcaa94c
commit 3990002250
2 changed files with 43 additions and 24 deletions

View File

@ -142,6 +142,7 @@ bool JitBlock::ContainsAddress(u32 em_address)
}
links_to.clear();
block_map.clear();
valid_block.reset();
num_blocks = 0;
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
}
@ -220,6 +221,9 @@ bool JitBlock::ContainsAddress(u32 em_address)
// Convert the logical address to a physical address for the block map
u32 pAddr = b.originalAddress & 0x1FFFFFFF;
for (u32 i = 0; i < (b.originalSize + 7) / 8; ++i)
valid_block[pAddr / 32 + i] = true;
block_map[std::make_pair(pAddr + 4 * b.originalSize - 1, pAddr)] = block_num;
if (block_link)
{
@ -440,35 +444,48 @@ bool JitBlock::ContainsAddress(u32 em_address)
// Convert the logical address to a physical address for the block map
u32 pAddr = address & 0x1FFFFFFF;
// Optimize the common case of length == 32 which is used by Interpreter::dcb*
bool destroy_block = true;
if (length == 32)
{
if (!valid_block[pAddr / 32])
destroy_block = false;
else
valid_block[pAddr / 32] = false;
}
// destroy JIT blocks
// !! this works correctly under assumption that any two overlapping blocks end at the same address
std::map<pair<u32,u32>, u32>::iterator it1 = block_map.lower_bound(std::make_pair(pAddr, 0)), it2 = it1, it;
while (it2 != block_map.end() && it2->first.second < pAddr + length)
if (destroy_block)
{
std::map<pair<u32,u32>, u32>::iterator it1 = block_map.lower_bound(std::make_pair(pAddr, 0)), it2 = it1, it;
while (it2 != block_map.end() && it2->first.second < pAddr + length)
{
#ifdef JIT_UNLIMITED_ICACHE
JitBlock &b = blocks[it2->second];
if (b.originalAddress & JIT_ICACHE_VMEM_BIT)
{
u32 cacheaddr = b.originalAddress & JIT_ICACHE_MASK;
memset(iCacheVMEM + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
else if (b.originalAddress & JIT_ICACHE_EXRAM_BIT)
{
u32 cacheaddr = b.originalAddress & JIT_ICACHEEX_MASK;
memset(iCacheEx + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
else
{
u32 cacheaddr = b.originalAddress & JIT_ICACHE_MASK;
memset(iCache + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
JitBlock &b = blocks[it2->second];
if (b.originalAddress & JIT_ICACHE_VMEM_BIT)
{
u32 cacheaddr = b.originalAddress & JIT_ICACHE_MASK;
memset(iCacheVMEM + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
else if (b.originalAddress & JIT_ICACHE_EXRAM_BIT)
{
u32 cacheaddr = b.originalAddress & JIT_ICACHEEX_MASK;
memset(iCacheEx + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
else
{
u32 cacheaddr = b.originalAddress & JIT_ICACHE_MASK;
memset(iCache + cacheaddr, JIT_ICACHE_INVALID_BYTE, 4);
}
#endif
DestroyBlock(it2->second, true);
it2++;
}
if (it1 != it2)
{
block_map.erase(it1, it2);
DestroyBlock(it2->second, true);
it2++;
}
if (it1 != it2)
{
block_map.erase(it1, it2);
}
}
#ifdef JIT_UNLIMITED_ICACHE

View File

@ -18,6 +18,7 @@
#ifndef _JITCACHE_H
#define _JITCACHE_H
#include <bitset>
#include <map>
#include <vector>
@ -84,6 +85,7 @@ class JitBlockCache
int num_blocks;
std::multimap<u32, int> links_to;
std::map<std::pair<u32,u32>, u32> block_map; // (end_addr, start_addr) -> number
std::bitset<0x20000000 / 32> valid_block;
#ifdef JIT_UNLIMITED_ICACHE
u8 *iCache;
u8 *iCacheEx;