Jit: Improve block lookup performance through a shm memory segment.
By using a shm memory segment for the fast_block_map that is sparsely
allocated (i.e. the OS commits pages only on first write) instead of a
statically allocated array, we can make block lookup faster by:

* Having a much bigger space available for lookup that still doesn't take up
  much memory, because the OS only allocates the needed pages when they are
  written to.
* Decreasing the time spent looking up a block in the assembly dispatcher,
  due to fewer comparisons and shorter code (for example, the pc check has
  been dropped entirely, since only the msrBits need to be validated).

When the JIT block cache is full, the shm segment is also released and
reallocated to avoid holding on to too much memory. It is likewise reset when
the instruction cache is flushed by PPC code, to avoid stale entries. If the
memory segment couldn't be allocated, we fall back to the original method.
This commit is contained in:
parent 4efa10c170
commit 859da32a6c
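The core allocation trick can be sketched in a few lines. This is a minimal
stand-alone illustration using plain POSIX mmap rather than Dolphin's
Common::MemArena wrapper; kMapBytes and AllocateFastBlockMap are hypothetical
names, not part of the commit:

#include <sys/mman.h>
#include <cstddef>

struct JitBlock;

// One 8-byte pointer slot for every possible 4-byte-aligned guest PC in a
// 32-bit address space: (2^32 / 4) * 8 bytes = 8 GiB of *virtual* space.
constexpr std::size_t kMapBytes = 0x2'0000'0000;

JitBlock** AllocateFastBlockMap()
{
  // Anonymous memory with MAP_NORESERVE: the kernel hands out zero-filled
  // pages lazily, on first write, so unused parts of the table cost nothing.
  void* view = mmap(nullptr, kMapBytes, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  return view == MAP_FAILED ? nullptr : static_cast<JitBlock**>(view);
}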
@@ -20,6 +20,10 @@
 using namespace Gen;
 
+// These need to be next to each other so that the assembly
+// code can compare them easily.
+static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));
+
 Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(jit)
 {
 }
@@ -103,35 +107,58 @@ void Jit64AsmRoutineManager::Generate()
   const bool assembly_dispatcher = true;
   if (assembly_dispatcher)
   {
-    // Fast block number lookup.
-    // ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
-    MOV(32, R(RSCRATCH), PPCSTATE(pc));
-    // Keep a copy for later.
-    MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
-    u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
-    AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 2));
-    if (icache <= INT_MAX)
+    if (m_jit.GetBlockCache()->GetFastBlockMap())
     {
-      MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
+      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
+      MOV(32, R(RSCRATCH), PPCSTATE(pc));
+
+      MOV(64, R(RSCRATCH2), Imm64(icache));
+      // Each 4-byte offset of the PC register corresponds to an 8-byte offset
+      // in the lookup table due to host pointers being 8 bytes long.
+      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
     }
     else
     {
-      MOV(64, R(RSCRATCH2), Imm64(icache));
-      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
+      // Fast block number lookup.
+      // ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
+      MOV(32, R(RSCRATCH), PPCSTATE(pc));
+      // Keep a copy for later.
+      MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
+      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMapFallback());
+      AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 2));
+      if (icache <= INT_MAX)
+      {
+        MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
+      }
+      else
+      {
+        MOV(64, R(RSCRATCH2), Imm64(icache));
+        MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
+      }
     }
 
     // Check if we found a block.
     TEST(64, R(RSCRATCH), R(RSCRATCH));
     FixupBranch not_found = J_CC(CC_Z);
 
-    // Check both block.effectiveAddress and block.msrBits.
+    // Check block.msrBits.
     MOV(32, R(RSCRATCH2), PPCSTATE(msr));
     AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
-    SHL(64, R(RSCRATCH2), Imm8(32));
-    // RSCRATCH_EXTRA still has the PC.
-    OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
-    CMP(64, R(RSCRATCH2),
-        MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+    if (m_jit.GetBlockCache()->GetFastBlockMap())
+    {
+      CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
+    }
+    else
+    {
+      // Also check the block.effectiveAddress
+      SHL(64, R(RSCRATCH2), Imm8(32));
+      // RSCRATCH_EXTRA still has the PC.
+      OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
+      CMP(64, R(RSCRATCH2),
+          MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+    }
 
     FixupBranch state_mismatch = J_CC(CC_NE);
 
     // Success; branch to the block we found.
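In C++ terms, the 64-bit compare emitted for the fallback path is roughly
equivalent to the following sketch (illustrative only; BlockMatches is a
hypothetical helper, and the single load relies on the little-endian field
layout guaranteed by the static_assert above):

#include <cstring>

bool BlockMatches(const JitBlockData& b, u32 pc, u32 msr)
{
  // effectiveAddress and msrBits are adjacent, so one 64-bit load at
  // &b.effectiveAddress reads both fields at once on a little-endian host.
  u64 stored;
  std::memcpy(&stored, &b.effectiveAddress, sizeof(stored));
  const u64 wanted = (u64{msr & JitBaseBlockCache::JIT_CACHE_MSR_MASK} << 32) | pc;
  return stored == wanted;  // CMP(64, ...) followed by J_CC(CC_NE)
}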
@@ -110,35 +110,67 @@ void JitArm64::GenerateAsm()
             jo.fastmem_arena ? memory.GetLogicalBase() : memory.GetLogicalPageMappingsBase());
     SetJumpTarget(membaseend);
 
-    // iCache[(address >> 2) & iCache_Mask];
-    ARM64Reg pc_masked = ARM64Reg::W25;
-    ARM64Reg cache_base = ARM64Reg::X27;
-    ARM64Reg block = ARM64Reg::X30;
-    ORR(pc_masked, ARM64Reg::WZR, LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 3, 32));
-    AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
-    MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
-    LDR(block, cache_base, EncodeRegTo64(pc_masked));
-    FixupBranch not_found = CBZ(block);
-
-    // b.effectiveAddress != addr || b.msrBits != msr
-    ARM64Reg pc_and_msr = ARM64Reg::W25;
-    ARM64Reg pc_and_msr2 = ARM64Reg::W24;
-    LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
-    CMP(pc_and_msr, DISPATCHER_PC);
-    FixupBranch pc_missmatch = B(CC_NEQ);
-
-    LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
-    AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
-    LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
-    CMP(pc_and_msr, pc_and_msr2);
-    FixupBranch msr_missmatch = B(CC_NEQ);
-
-    // return blocks[block_num].normalEntry;
-    LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
-    BR(block);
-    SetJumpTarget(not_found);
-    SetJumpTarget(pc_missmatch);
-    SetJumpTarget(msr_missmatch);
+    if (GetBlockCache()->GetFastBlockMap())
+    {
+      // Check if there is a block
+      ARM64Reg pc_masked = ARM64Reg::X25;
+      ARM64Reg cache_base = ARM64Reg::X27;
+      ARM64Reg block = ARM64Reg::X30;
+      LSL(pc_masked, DISPATCHER_PC, 1);
+      MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
+      LDR(block, cache_base, pc_masked);
+      FixupBranch not_found = CBZ(block);
+
+      // b.msrBits != msr
+      ARM64Reg msr = ARM64Reg::W25;
+      ARM64Reg msr2 = ARM64Reg::W24;
+      LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
+      AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
+      LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
+      CMP(msr, msr2);
+      FixupBranch msr_missmatch = B(CC_NEQ);
+
+      // return blocks[block_num].normalEntry;
+      LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
+      BR(block);
+      SetJumpTarget(not_found);
+      SetJumpTarget(msr_missmatch);
+    }
+    else
+    {
+      // iCache[(address >> 2) & iCache_Mask];
+      ARM64Reg pc_masked = ARM64Reg::W25;
+      ARM64Reg cache_base = ARM64Reg::X27;
+      ARM64Reg block = ARM64Reg::X30;
+      ORR(pc_masked, ARM64Reg::WZR,
+          LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
+      AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
+      MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
+      LDR(block, cache_base, EncodeRegTo64(pc_masked));
+      FixupBranch not_found = CBZ(block);
+
+      // b.effectiveAddress != addr || b.msrBits != msr
+      ARM64Reg pc_and_msr = ARM64Reg::W25;
+      ARM64Reg pc_and_msr2 = ARM64Reg::W24;
+      LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
+      CMP(pc_and_msr, DISPATCHER_PC);
+      FixupBranch pc_missmatch = B(CC_NEQ);
+
+      LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
+      AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
+      LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
+      CMP(pc_and_msr, pc_and_msr2);
+      FixupBranch msr_missmatch = B(CC_NEQ);
+
+      // return blocks[block_num].normalEntry;
+      LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
+      BR(block);
+      SetJumpTarget(not_found);
+      SetJumpTarget(pc_missmatch);
+      SetJumpTarget(msr_missmatch);
+    }
   }
 
   // Call C version of Dispatch().
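The reason the shm path may skip the effectiveAddress check: the full-size
table has exactly one slot per possible guest PC, so a non-null entry can
only have been stored for this PC, and only the MSR translation bits still
need validation. A hypothetical C++ rendering of the emitted dispatch logic,
covering both modes (DispatchSketch is not part of the commit):

const u8* DispatchSketch(JitBlock** map, bool shm_map_in_use, u32 pc, u32 msr)
{
  const size_t index =
      shm_map_in_use ? (pc >> 2) : ((pc >> 2) & JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK);
  JitBlock* b = map[index];
  if (!b)
    return nullptr;  // not_found
  if (!shm_map_in_use && b->effectiveAddress != pc)
    return nullptr;  // pc_missmatch: only the small table aliases many PCs per slot
  if (b->msrBits != (msr & JitBaseBlockCache::JIT_CACHE_MSR_MASK))
    return nullptr;  // msr_missmatch
  return b->normalEntry;  // BR(block)
}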
@@ -42,12 +42,21 @@ void JitBaseBlockCache::Init()
 {
   Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));
 
+  m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
+
   Clear();
 }
 
 void JitBaseBlockCache::Shutdown()
 {
   Common::JitRegister::Shutdown();
+
+  if (m_fast_block_map)
+  {
+    m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
+  }
+
+  m_block_map_arena.ReleaseSHMSegment();
 }
 
 // This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -70,7 +79,24 @@ void JitBaseBlockCache::Clear()
 
   valid_block.ClearAll();
 
-  fast_block_map.fill(nullptr);
+  if (m_fast_block_map)
+  {
+    m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
+    m_block_map_arena.ReleaseSHMSegment();
+    m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
+  }
+
+  m_fast_block_map =
+      reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));
+
+  if (m_fast_block_map)
+  {
+    m_fast_block_map_ptr = m_fast_block_map;
+  }
+  else
+  {
+    m_fast_block_map_ptr = m_fast_block_map_fallback.data();
+  }
 }
 
 void JitBaseBlockCache::Reset()
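A note on why Clear() releases and re-grabs the segment rather than zeroing
it: writing zeros would touch, and therefore commit, every page of the huge
span, while dropping the mapping returns the committed pages to the OS and a
fresh view is zero-filled on demand. In terms of the hypothetical POSIX
sketch from above:

// Clearing by remapping: cost is proportional to the pages actually
// committed so far, not to the full 8 GiB virtual span.
munmap(map, kMapBytes);
map = AllocateFastBlockMap();  // new lazily-committed, zero-filled view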
@@ -81,7 +107,12 @@ void JitBaseBlockCache::Reset()
 
 JitBlock** JitBaseBlockCache::GetFastBlockMap()
 {
-  return fast_block_map.data();
+  return m_fast_block_map;
 }
 
+JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
+{
+  return m_fast_block_map_fallback.data();
+}
+
 void JitBaseBlockCache::RunOnBlocks(std::function<void(const JitBlock&)> f)
@@ -106,7 +137,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
                                       const std::set<u32>& physical_addresses)
 {
   size_t index = FastLookupIndexForAddress(block.effectiveAddress);
-  fast_block_map[index] = &block;
+  m_fast_block_map_ptr[index] = &block;
   block.fast_block_map_index = index;
 
   block.physical_addresses = physical_addresses;
@@ -169,7 +200,7 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
 const u8* JitBaseBlockCache::Dispatch()
 {
   const auto& ppc_state = m_jit.m_ppc_state;
-  JitBlock* block = fast_block_map[FastLookupIndexForAddress(ppc_state.pc)];
+  JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)];
 
   if (!block || block->effectiveAddress != ppc_state.pc ||
       block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -390,8 +421,8 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
 
 void JitBaseBlockCache::DestroyBlock(JitBlock& block)
 {
-  if (fast_block_map[block.fast_block_map_index] == &block)
-    fast_block_map[block.fast_block_map_index] = nullptr;
+  if (m_fast_block_map_ptr[block.fast_block_map_index] == &block)
+    m_fast_block_map_ptr[block.fast_block_map_index] = nullptr;
 
   UnlinkBlock(block);
@@ -418,12 +449,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
     return nullptr;
 
   // Drop old fast block map entry
-  if (fast_block_map[block->fast_block_map_index] == block)
-    fast_block_map[block->fast_block_map_index] = nullptr;
+  if (m_fast_block_map_ptr[block->fast_block_map_index] == block)
+    m_fast_block_map_ptr[block->fast_block_map_index] = nullptr;
 
   // And create a new one
   size_t index = FastLookupIndexForAddress(addr);
-  fast_block_map[index] = block;
+  m_fast_block_map_ptr[index] = block;
   block->fast_block_map_index = index;
 
   return block;
@@ -431,5 +462,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
 
 size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address)
 {
-  return (address >> 2) & FAST_BLOCK_MAP_MASK;
+  if (m_fast_block_map)
+  {
+    return address >> 2;
+  }
+  else
+  {
+    return (address >> 2) & FAST_BLOCK_MAP_FALLBACK_MASK;
+  }
 }
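A worked example of the two index computations (the PC value is arbitrary):

// pc = 0x80003F00 (a 4-byte-aligned guest address)
// shm map:      index = pc >> 2            = 0x20000FC0  -> byte offset 0x1'0000'7E00
// fallback map: index = (pc >> 2) & 0xFFFF = 0x0FC0      (many PCs alias this slot)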
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include "Common/CommonTypes.h"
+#include "Core/HW/Memmap.h"
 
 class JitBase;
@@ -131,8 +132,11 @@ public:
   // is valid (MSR.IR and MSR.DR, the address translation bits).
   static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;
 
-  static constexpr u32 FAST_BLOCK_MAP_ELEMENTS = 0x10000;
-  static constexpr u32 FAST_BLOCK_MAP_MASK = FAST_BLOCK_MAP_ELEMENTS - 1;
+  // The value for the map is determined like this:
+  // ((4 GB guest memory space) / (4 bytes per address)) * sizeof(JitBlock*)
+  static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000;
+  static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
+  static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;
 
   explicit JitBaseBlockCache(JitBase& jit);
   virtual ~JitBaseBlockCache();
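The size constant follows from the formula in the comment; a sanity check one
could add (not part of the commit, and assuming a 64-bit host where
sizeof(JitBlock*) == 8):

// (2^32 bytes of guest address space / 4 bytes per aligned instruction)
// * 8 bytes per JitBlock* = 2^33 bytes = 0x2'0000'0000 (8 GiB, virtual only).
static_assert(JitBaseBlockCache::FAST_BLOCK_MAP_SIZE ==
              ((u64{1} << 32) / 4) * sizeof(JitBlock*));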
@@ -144,6 +148,7 @@ public:
 
   // Code Cache
   JitBlock** GetFastBlockMap();
+  JitBlock** GetFastBlockMapFallback();
   void RunOnBlocks(std::function<void(const JitBlock&)> f);
 
   JitBlock* AllocateBlock(u32 em_address);
@@ -203,7 +208,16 @@ private:
   // It is used to provide a fast way to query if no icache invalidation is needed.
   ValidBlockBitSet valid_block;
 
-  // This array is indexed with the masked PC and likely holds the correct block id.
+  // This array is indexed with the shifted PC and likely holds the correct block id.
   // This is used as a fast cache of block_map used in the assembly dispatcher.
-  std::array<JitBlock*, FAST_BLOCK_MAP_ELEMENTS> fast_block_map{};  // start_addr & mask -> number
+  // It is implemented via a shm segment using m_block_map_arena.
+  JitBlock** m_fast_block_map = 0;
+  Common::MemArena m_block_map_arena;
+
+  // An alternative for the above fast_block_map but without a shm segment
+  // in case the shm memory region couldn't be allocated.
+  std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
+      m_fast_block_map_fallback{};  // start_addr & mask -> number
+
+  JitBlock** m_fast_block_map_ptr = 0;
 };