Jit: Improve block lookup performance through a shm memory segment.

By using a shm memory segment for the fast_block_map that is sparsely
allocated (i.e. committed on write by the OS) instead of a statically
allocated array, we can make the block lookup faster (see the sketch
after this list) by:
* Having a bigger space available for lookup that doesn't take up
  too much memory, because the OS will only allocate the needed
  pages when they are written to.
* Decreasing the time spent looking up a block in the assembly
  dispatcher, thanks to fewer comparisons and shorter code (for
  example, the pc check has been dropped entirely since only the
  msrBits need to be validated).
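A minimal POSIX sketch of the sparse-allocation idea (illustrative only: Dolphin goes through its MemArena wrapper instead, and the segment name below is made up):

```cpp
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main()
{
  // Reserve one pointer slot per possible 4-byte-aligned PC. The OS only
  // commits physical pages when a slot is first written to.
  // (Error checks omitted for brevity.)
  constexpr size_t kSize = 0x2'0000'0000;  // 8 GiB of address space
  int fd = shm_open("/jitblock-demo", O_CREAT | O_RDWR, 0600);
  ftruncate(fd, kSize);
  void** table = static_cast<void**>(
      mmap(nullptr, kSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
  // Writing one entry commits a single page, not the whole 8 GiB.
  table[0x80003100u >> 2] = &fd;
  munmap(table, kSize);
  shm_unlink("/jitblock-demo");
  close(fd);
}
```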

When the JIT block cache is full, the shm segment is released and
reallocated to avoid holding on to too much memory. It is also reset
when the instruction cache is flushed by the PPC code, to avoid stale
entries.

We also fall back to the original method in case the memory segment
couldn't be allocated; the lifecycle is sketched below.
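Condensed from the Clear() hunk below, the allocate/release/fallback lifecycle looks like this (member and MemArena names as they appear in the diff):

```cpp
// On Clear(): drop the old view and shm segment, then grab fresh ones so
// that all previously committed pages are returned to the OS.
if (m_fast_block_map)
{
  m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
  m_block_map_arena.ReleaseSHMSegment();
  m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
}
m_fast_block_map =
    reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));
// If the view couldn't be created, dispatch through the small static array.
m_fast_block_map_ptr =
    m_fast_block_map ? m_fast_block_map : m_fast_block_map_fallback.data();
```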
Franz-Josef Haider 2023-04-04 21:26:43 +03:00
parent 4efa10c170
commit 859da32a6c
4 changed files with 168 additions and 57 deletions

View File

@@ -20,6 +20,10 @@
using namespace Gen;
// These need to be next to each other so that the assembly
// code can compare them easily.
static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));
Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(jit)
{
}
@@ -103,35 +107,58 @@ void Jit64AsmRoutineManager::Generate()
const bool assembly_dispatcher = true;
if (assembly_dispatcher)
{
// Fast block number lookup.
// ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
MOV(32, R(RSCRATCH), PPCSTATE(pc));
// Keep a copy for later.
MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 2));
if (icache <= INT_MAX)
if (m_jit.GetBlockCache()->GetFastBlockMap())
{
MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(64, R(RSCRATCH2), Imm64(icache));
// Each 4-byte offset of the PC register corresponds to an 8-byte offset
// in the lookup table due to host pointers being 8 bytes long.
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
}
else
{
MOV(64, R(RSCRATCH2), Imm64(icache));
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
// Fast block number lookup.
// ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
MOV(32, R(RSCRATCH), PPCSTATE(pc));
// Keep a copy for later.
MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMapFallback());
AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 2));
if (icache <= INT_MAX)
{
MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
}
else
{
MOV(64, R(RSCRATCH2), Imm64(icache));
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
}
}
// Check if we found a block.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch not_found = J_CC(CC_Z);
// Check both block.effectiveAddress and block.msrBits.
// Check block.msrBits.
MOV(32, R(RSCRATCH2), PPCSTATE(msr));
AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
SHL(64, R(RSCRATCH2), Imm8(32));
// RSCRATCH_EXTRA still has the PC.
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
CMP(64, R(RSCRATCH2),
MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
if (m_jit.GetBlockCache()->GetFastBlockMap())
{
CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
}
else
{
// Also check the block.effectiveAddress
SHL(64, R(RSCRATCH2), Imm8(32));
// RSCRATCH_EXTRA still has the PC.
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
CMP(64, R(RSCRATCH2),
MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
}
FixupBranch state_mismatch = J_CC(CC_NE);
// Success; branch to the block we found.
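The addressing trick both paths rely on: table entries are 8 bytes and PCs are 4-byte aligned, so the byte offset is (pc >> 2) * 8 == pc * 2, which is what the SCALE_2 operand encodes. In plain C++ the two checks emitted above amount to this sketch (assuming Dolphin's JitBlock and JitBaseBlockCache types; not the emitted code itself):

```cpp
JitBlock* LookupSketch(JitBlock** shm_map, JitBlock** fallback_map, u32 pc, u32 msr)
{
  const u32 msr_bits = msr & JitBaseBlockCache::JIT_CACHE_MSR_MASK;
  if (shm_map)
  {
    // One slot per possible PC: a non-null hit already implies
    // block.effectiveAddress == pc, so only msrBits needs validating.
    JitBlock* b = shm_map[pc >> 2];
    return (b && b->msrBits == msr_bits) ? b : nullptr;
  }
  // Masked fallback: distinct PCs can collide, so effectiveAddress must be
  // checked as well. The emitted code folds pc and msrBits into one 64-bit
  // compare, which is why the static_assert above requires the two fields
  // to be adjacent.
  JitBlock* b = fallback_map[(pc >> 2) & JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK];
  return (b && b->effectiveAddress == pc && b->msrBits == msr_bits) ? b : nullptr;
}
```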

View File

@@ -110,35 +110,67 @@ void JitArm64::GenerateAsm()
jo.fastmem_arena ? memory.GetLogicalBase() : memory.GetLogicalPageMappingsBase());
SetJumpTarget(membaseend);
// iCache[(address >> 2) & iCache_Mask];
ARM64Reg pc_masked = ARM64Reg::W25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
ORR(pc_masked, ARM64Reg::WZR, LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 3, 32));
AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
LDR(block, cache_base, EncodeRegTo64(pc_masked));
FixupBranch not_found = CBZ(block);
if (GetBlockCache()->GetFastBlockMap())
{
// Check if there is a block
ARM64Reg pc_masked = ARM64Reg::X25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
LSL(pc_masked, DISPATCHER_PC, 1);
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
LDR(block, cache_base, pc_masked);
FixupBranch not_found = CBZ(block);
// b.effectiveAddress != addr || b.msrBits != msr
ARM64Reg pc_and_msr = ARM64Reg::W25;
ARM64Reg pc_and_msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
CMP(pc_and_msr, DISPATCHER_PC);
FixupBranch pc_missmatch = B(CC_NEQ);
// b.msrBits != msr
ARM64Reg msr = ARM64Reg::W25;
ARM64Reg msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
CMP(msr, msr2);
LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
CMP(pc_and_msr, pc_and_msr2);
FixupBranch msr_missmatch = B(CC_NEQ);
FixupBranch msr_missmatch = B(CC_NEQ);
// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(pc_missmatch);
SetJumpTarget(msr_missmatch);
// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(msr_missmatch);
}
else
{
// iCache[(address >> 2) & iCache_Mask];
ARM64Reg pc_masked = ARM64Reg::W25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
ORR(pc_masked, ARM64Reg::WZR,
LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
LDR(block, cache_base, EncodeRegTo64(pc_masked));
FixupBranch not_found = CBZ(block);
// b.effectiveAddress != addr || b.msrBits != msr
ARM64Reg pc_and_msr = ARM64Reg::W25;
ARM64Reg pc_and_msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
CMP(pc_and_msr, DISPATCHER_PC);
FixupBranch pc_missmatch = B(CC_NEQ);
LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
CMP(pc_and_msr, pc_and_msr2);
FixupBranch msr_missmatch = B(CC_NEQ);
// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(pc_missmatch);
SetJumpTarget(msr_missmatch);
}
}
// Call C version of Dispatch().
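For example, with DISPATCHER_PC = 0x80003100 the shm path computes the byte offset 0x80003100 << 1 = 0x1'0000'6200, i.e. 8-byte slot 0x2000'0C40 (= pc >> 2); no masking is needed because the full map has a slot for every 4-byte-aligned address in the 32-bit space, whereas the fallback path still masks the offset with FAST_BLOCK_MAP_FALLBACK_MASK << 3.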

View File

@@ -42,12 +42,21 @@ void JitBaseBlockCache::Init()
{
Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));
m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
Clear();
}
void JitBaseBlockCache::Shutdown()
{
Common::JitRegister::Shutdown();
if (m_fast_block_map)
{
m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
}
m_block_map_arena.ReleaseSHMSegment();
}
// This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -70,7 +79,24 @@ void JitBaseBlockCache::Clear()
valid_block.ClearAll();
fast_block_map.fill(nullptr);
if (m_fast_block_map)
{
m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
m_block_map_arena.ReleaseSHMSegment();
m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
}
m_fast_block_map =
reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));
if (m_fast_block_map)
{
m_fast_block_map_ptr = m_fast_block_map;
}
else
{
m_fast_block_map_ptr = m_fast_block_map_fallback.data();
}
}
void JitBaseBlockCache::Reset()
@@ -81,7 +107,12 @@ void JitBaseBlockCache::Reset()
JitBlock** JitBaseBlockCache::GetFastBlockMap()
{
return fast_block_map.data();
return m_fast_block_map;
}
JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
{
return m_fast_block_map_fallback.data();
}
void JitBaseBlockCache::RunOnBlocks(std::function<void(const JitBlock&)> f)
@@ -106,7 +137,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
const std::set<u32>& physical_addresses)
{
size_t index = FastLookupIndexForAddress(block.effectiveAddress);
fast_block_map[index] = &block;
m_fast_block_map_ptr[index] = &block;
block.fast_block_map_index = index;
block.physical_addresses = physical_addresses;
@@ -169,7 +200,7 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
const u8* JitBaseBlockCache::Dispatch()
{
const auto& ppc_state = m_jit.m_ppc_state;
JitBlock* block = fast_block_map[FastLookupIndexForAddress(ppc_state.pc)];
JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)];
if (!block || block->effectiveAddress != ppc_state.pc ||
block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -390,8 +421,8 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
void JitBaseBlockCache::DestroyBlock(JitBlock& block)
{
if (fast_block_map[block.fast_block_map_index] == &block)
fast_block_map[block.fast_block_map_index] = nullptr;
if (m_fast_block_map_ptr[block.fast_block_map_index] == &block)
m_fast_block_map_ptr[block.fast_block_map_index] = nullptr;
UnlinkBlock(block);
@@ -418,12 +449,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
return nullptr;
// Drop old fast block map entry
if (fast_block_map[block->fast_block_map_index] == block)
fast_block_map[block->fast_block_map_index] = nullptr;
if (m_fast_block_map_ptr[block->fast_block_map_index] == block)
m_fast_block_map_ptr[block->fast_block_map_index] = nullptr;
// And create a new one
size_t index = FastLookupIndexForAddress(addr);
fast_block_map[index] = block;
m_fast_block_map_ptr[index] = block;
block->fast_block_map_index = index;
return block;
@@ -431,5 +462,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address)
{
return (address >> 2) & FAST_BLOCK_MAP_MASK;
if (m_fast_block_map)
{
return address >> 2;
}
else
{
return (address >> 2) & FAST_BLOCK_MAP_FALLBACK_MASK;
}
}
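A quick sanity check of the two index schemes (addresses are illustrative):

```cpp
// Shm map: full-range index, one slot per 4-byte-aligned PC.
static_assert((0x80003100u >> 2) == 0x20000C40);
// Fallback map: 16-bit masked index, so distinct PCs can collide --
// hence the extra effectiveAddress check on that path.
static_assert(((0x80003100u >> 2) & 0xFFFF) == 0x0C40);
static_assert(((0x80043100u >> 2) & 0xFFFF) == 0x0C40);  // collision
```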

View File

@@ -16,6 +16,7 @@
#include <vector>
#include "Common/CommonTypes.h"
#include "Core/HW/Memmap.h"
class JitBase;
@@ -131,8 +132,11 @@ public:
// is valid (MSR.IR and MSR.DR, the address translation bits).
static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;
static constexpr u32 FAST_BLOCK_MAP_ELEMENTS = 0x10000;
static constexpr u32 FAST_BLOCK_MAP_MASK = FAST_BLOCK_MAP_ELEMENTS - 1;
// The value for the map is determined like this:
// ((4 GB guest memory space) / (4 bytes per address)) * sizeof(JitBlock*)
static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000;
static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;
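The arithmetic: (2^32 bytes of guest address space / 4 bytes per instruction) * 8 bytes per JitBlock* = 2^33 bytes = 0x2'0000'0000, i.e. 8 GiB of reserved address space, of which only the written pages ever get committed.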
explicit JitBaseBlockCache(JitBase& jit);
virtual ~JitBaseBlockCache();
@@ -144,6 +148,7 @@ public:
// Code Cache
JitBlock** GetFastBlockMap();
JitBlock** GetFastBlockMapFallback();
void RunOnBlocks(std::function<void(const JitBlock&)> f);
JitBlock* AllocateBlock(u32 em_address);
@@ -203,7 +208,16 @@ private:
// It is used to provide a fast way to query if no icache invalidation is needed.
ValidBlockBitSet valid_block;
// This array is indexed with the masked PC and likely holds the correct block id.
// This array is indexed with the shifted PC and likely holds the correct block id.
// This is used as a fast cache of block_map used in the assembly dispatcher.
std::array<JitBlock*, FAST_BLOCK_MAP_ELEMENTS> fast_block_map{}; // start_addr & mask -> number
// It is implemented via a shm segment using m_block_map_arena.
JitBlock** m_fast_block_map = 0;
Common::MemArena m_block_map_arena;
// An alternative to the above fast_block_map, used without a shm segment
// in case the shm memory region couldn't be allocated.
std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
m_fast_block_map_fallback{}; // start_addr & mask -> number
JitBlock** m_fast_block_map_ptr = 0;
};