From 859da32a6c3ec98c10839f1e18254e1a1107d575 Mon Sep 17 00:00:00 2001
From: Franz-Josef Haider
Date: Tue, 4 Apr 2023 21:26:43 +0300
Subject: [PATCH] Jit: Improve block lookup performance through a shm memory
 segment.

By using a shm memory segment for the fast_block_map that is sparsely
allocated (i.e. only backed by physical pages once the OS sees a write)
instead of a statically allocated array, we can make the block lookup
faster by:

* Having a much bigger lookup space available without using too much
  memory, because the OS only allocates the pages that are actually
  written to.
* Decreasing the time spent on a block lookup in the assembly
  dispatcher, thanks to fewer comparisons and shorter code (for
  example, the pc check has been dropped entirely, since only the
  msrBits need to be validated).

When the JIT block cache is full, the shm segment is also released and
reallocated to avoid holding on to too much memory. It is likewise
reset when the instruction cache is flushed by the PPC code, to avoid
keeping stale entries around.

If the shm memory segment couldn't be allocated, we fall back to the
original lookup method.
---
 Source/Core/Core/PowerPC/Jit64/JitAsm.cpp     | 61 ++++++++++----
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp  | 84 +++++++++++++------
 .../Core/Core/PowerPC/JitCommon/JitCache.cpp  | 58 ++++++++++---
 Source/Core/Core/PowerPC/JitCommon/JitCache.h | 22 ++++-
 4 files changed, 168 insertions(+), 57 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index 44e2a3d875..d989d33fd5 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -20,6 +20,10 @@
 
 using namespace Gen;
 
+// These need to be next to each other so that the assembly
+// code can compare them easily.
+static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));
+
 Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(jit)
 {
 }
@@ -103,35 +107,58 @@ void Jit64AsmRoutineManager::Generate()
   const bool assembly_dispatcher = true;
   if (assembly_dispatcher)
   {
-    // Fast block number lookup.
-    // ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
-    MOV(32, R(RSCRATCH), PPCSTATE(pc));
-    // Keep a copy for later.
-    MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
-    u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
-    AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 2));
-    if (icache <= INT_MAX)
+    if (m_jit.GetBlockCache()->GetFastBlockMap())
     {
-      MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
+      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
+      MOV(32, R(RSCRATCH), PPCSTATE(pc));
+
+      MOV(64, R(RSCRATCH2), Imm64(icache));
+      // Each 4-byte offset of the PC corresponds to an 8-byte offset
+      // in the lookup table, since host pointers are 8 bytes long.
+      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
     }
     else
     {
-      MOV(64, R(RSCRATCH2), Imm64(icache));
-      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
+      // Fast block number lookup.
+      // ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
+      MOV(32, R(RSCRATCH), PPCSTATE(pc));
+      // Keep a copy for later.
+      MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
+      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMapFallback());
+      AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 2));
+      if (icache <= INT_MAX)
+      {
+        MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
+      }
+      else
+      {
+        MOV(64, R(RSCRATCH2), Imm64(icache));
+        MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
+      }
     }
 
     // Check if we found a block.
     TEST(64, R(RSCRATCH), R(RSCRATCH));
     FixupBranch not_found = J_CC(CC_Z);
 
-    // Check both block.effectiveAddress and block.msrBits.
+    // Check block.msrBits.
     MOV(32, R(RSCRATCH2), PPCSTATE(msr));
     AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
-    SHL(64, R(RSCRATCH2), Imm8(32));
-    // RSCRATCH_EXTRA still has the PC.
-    OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
-    CMP(64, R(RSCRATCH2),
-        MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+
+    if (m_jit.GetBlockCache()->GetFastBlockMap())
+    {
+      CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
+    }
+    else
+    {
+      // Also check block.effectiveAddress.
+      SHL(64, R(RSCRATCH2), Imm8(32));
+      // RSCRATCH_EXTRA still has the PC.
+      OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
+      CMP(64, R(RSCRATCH2),
+          MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+    }
+
     FixupBranch state_mismatch = J_CC(CC_NE);
 
     // Success; branch to the block we found.
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index ff5fd713f2..6f1066188e 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -110,35 +110,67 @@ void JitArm64::GenerateAsm()
                 jo.fastmem_arena ? memory.GetLogicalBase() : memory.GetLogicalPageMappingsBase());
   SetJumpTarget(membaseend);
 
-  // iCache[(address >> 2) & iCache_Mask];
-  ARM64Reg pc_masked = ARM64Reg::W25;
-  ARM64Reg cache_base = ARM64Reg::X27;
-  ARM64Reg block = ARM64Reg::X30;
-  ORR(pc_masked, ARM64Reg::WZR, LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 3, 32));
-  AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
-  MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
-  LDR(block, cache_base, EncodeRegTo64(pc_masked));
-  FixupBranch not_found = CBZ(block);
+  if (GetBlockCache()->GetFastBlockMap())
+  {
+    // Check if there is a block.
+    ARM64Reg pc_masked = ARM64Reg::X25;
+    ARM64Reg cache_base = ARM64Reg::X27;
+    ARM64Reg block = ARM64Reg::X30;
+    LSL(pc_masked, DISPATCHER_PC, 1);
+    MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
+    LDR(block, cache_base, pc_masked);
+    FixupBranch not_found = CBZ(block);
 
-  // b.effectiveAddress != addr || b.msrBits != msr
-  ARM64Reg pc_and_msr = ARM64Reg::W25;
-  ARM64Reg pc_and_msr2 = ARM64Reg::W24;
-  LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
-  CMP(pc_and_msr, DISPATCHER_PC);
-  FixupBranch pc_missmatch = B(CC_NEQ);
+    // b.msrBits != msr
+    ARM64Reg msr = ARM64Reg::W25;
+    ARM64Reg msr2 = ARM64Reg::W24;
+    LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
+    AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
+    LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
+    CMP(msr, msr2);
 
-  LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
-  AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
-  LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
-  CMP(pc_and_msr, pc_and_msr2);
-  FixupBranch msr_missmatch = B(CC_NEQ);
+    FixupBranch msr_missmatch = B(CC_NEQ);
 
-  // return blocks[block_num].normalEntry;
-  LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
-  BR(block);
-  SetJumpTarget(not_found);
-  SetJumpTarget(pc_missmatch);
-  SetJumpTarget(msr_missmatch);
+    // return blocks[block_num].normalEntry;
+    LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
+    BR(block);
+    SetJumpTarget(not_found);
+    SetJumpTarget(msr_missmatch);
+  }
+  else
+  {
+    // iCache[(address >> 2) & iCache_Mask];
+    ARM64Reg pc_masked = ARM64Reg::W25;
+    ARM64Reg cache_base = ARM64Reg::X27;
+    ARM64Reg block = ARM64Reg::X30;
+    ORR(pc_masked, ARM64Reg::WZR,
+        LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
+    AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
+    MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
+    LDR(block, cache_base, EncodeRegTo64(pc_masked));
+    FixupBranch not_found = CBZ(block);
+
+    // b.effectiveAddress != addr || b.msrBits != msr
+    ARM64Reg pc_and_msr = ARM64Reg::W25;
+    ARM64Reg pc_and_msr2 = ARM64Reg::W24;
+    LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
+    CMP(pc_and_msr, DISPATCHER_PC);
+    FixupBranch pc_missmatch = B(CC_NEQ);
+
+    LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
+    AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
+    LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
+    CMP(pc_and_msr, pc_and_msr2);
+
+    FixupBranch msr_missmatch = B(CC_NEQ);
+
+    // return blocks[block_num].normalEntry;
+    LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
+    BR(block);
+    SetJumpTarget(not_found);
+    SetJumpTarget(pc_missmatch);
+    SetJumpTarget(msr_missmatch);
+  }
 }
 
 // Call C version of Dispatch().
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
index b01647b70f..6b746d94d1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
@@ -42,12 +42,21 @@ void JitBaseBlockCache::Init()
 {
   Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));
 
+  m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
+
   Clear();
 }
 
 void JitBaseBlockCache::Shutdown()
 {
   Common::JitRegister::Shutdown();
+
+  if (m_fast_block_map)
+  {
+    m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
+  }
+
+  m_block_map_arena.ReleaseSHMSegment();
 }
 
 // This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -70,7 +79,24 @@ void JitBaseBlockCache::Clear()
 
   valid_block.ClearAll();
 
-  fast_block_map.fill(nullptr);
+  if (m_fast_block_map)
+  {
+    m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
+    m_block_map_arena.ReleaseSHMSegment();
+    m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
+  }
+
+  m_fast_block_map =
+      reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));
+
+  if (m_fast_block_map)
+  {
+    m_fast_block_map_ptr = m_fast_block_map;
+  }
+  else
+  {
+    m_fast_block_map_ptr = m_fast_block_map_fallback.data();
+  }
 }
 
 void JitBaseBlockCache::Reset()
@@ -81,7 +107,12 @@
 
 JitBlock** JitBaseBlockCache::GetFastBlockMap()
 {
-  return fast_block_map.data();
+  return m_fast_block_map;
+}
+
+JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
+{
+  return m_fast_block_map_fallback.data();
 }
 
 void JitBaseBlockCache::RunOnBlocks(std::function<void(const JitBlock&)> f)
@@ -106,7 +137,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
                                       const std::set<u32>& physical_addresses)
 {
   size_t index = FastLookupIndexForAddress(block.effectiveAddress);
-  fast_block_map[index] = &block;
+  m_fast_block_map_ptr[index] = &block;
   block.fast_block_map_index = index;
 
   block.physical_addresses = physical_addresses;
@@ -169,7 +200,7 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
 const u8* JitBaseBlockCache::Dispatch()
 {
   const auto& ppc_state = m_jit.m_ppc_state;
-  JitBlock* block = fast_block_map[FastLookupIndexForAddress(ppc_state.pc)];
+  JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)];
 
   if (!block || block->effectiveAddress != ppc_state.pc ||
       block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -390,8 +421,8 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
 
 void JitBaseBlockCache::DestroyBlock(JitBlock& block)
 {
-  if (fast_block_map[block.fast_block_map_index] == &block)
-    fast_block_map[block.fast_block_map_index] = nullptr;
+  if (m_fast_block_map_ptr[block.fast_block_map_index] == &block)
+    m_fast_block_map_ptr[block.fast_block_map_index] = nullptr;
 
   UnlinkBlock(block);
 
@@ -418,12 +449,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
     return nullptr;
 
   // Drop old fast block map entry
-  if (fast_block_map[block->fast_block_map_index] == block)
-    fast_block_map[block->fast_block_map_index] = nullptr;
+  if (m_fast_block_map_ptr[block->fast_block_map_index] == block)
+    m_fast_block_map_ptr[block->fast_block_map_index] = nullptr;
 
   // And create a new one
   size_t index = FastLookupIndexForAddress(addr);
-  fast_block_map[index] = block;
+  m_fast_block_map_ptr[index] = block;
   block->fast_block_map_index = index;
 
   return block;
@@ -431,5 +462,12 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
 
 size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address)
 {
-  return (address >> 2) & FAST_BLOCK_MAP_MASK;
+  if (m_fast_block_map)
+  {
+    return address >> 2;
+  }
+  else
+  {
+    return (address >> 2) & FAST_BLOCK_MAP_FALLBACK_MASK;
+  }
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
index f2c0719109..61b5966a14 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include "Common/CommonTypes.h"
+#include "Core/HW/Memmap.h"
 
 class JitBase;
 
@@ -131,8 +132,11 @@ public:
   // is valid (MSR.IR and MSR.DR, the address translation bits).
   static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;
 
-  static constexpr u32 FAST_BLOCK_MAP_ELEMENTS = 0x10000;
-  static constexpr u32 FAST_BLOCK_MAP_MASK = FAST_BLOCK_MAP_ELEMENTS - 1;
+  // The size of the map is determined like this:
+  // ((4 GiB of guest address space) / (4 bytes per instruction)) * sizeof(JitBlock*) = 8 GiB
+  static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000;
+  static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
+  static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;
 
   explicit JitBaseBlockCache(JitBase& jit);
   virtual ~JitBaseBlockCache();
@@ -144,6 +148,7 @@ public:
 
   // Code Cache
   JitBlock** GetFastBlockMap();
+  JitBlock** GetFastBlockMapFallback();
   void RunOnBlocks(std::function<void(const JitBlock&)> f);
 
   JitBlock* AllocateBlock(u32 em_address);
@@ -203,7 +208,16 @@ private:
   // It is used to provide a fast way to query if no icache invalidation is needed.
   ValidBlockBitSet valid_block;
 
-  // This array is indexed with the masked PC and likely holds the correct block id.
+  // This array is indexed with the shifted PC and likely holds the correct block id.
   // This is used as a fast cache of block_map used in the assembly dispatcher.
-  std::array<JitBlock*, FAST_BLOCK_MAP_ELEMENTS> fast_block_map{};  // start_addr & mask -> number
+  // It is implemented via a shm segment using m_block_map_arena.
+  JitBlock** m_fast_block_map = nullptr;
+  Common::MemArena m_block_map_arena;
+
+  // A fallback for the above fast_block_map, without a shm segment,
+  // used in case the shm memory region couldn't be allocated.
+  std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
+      m_fast_block_map_fallback{};  // start_addr & mask -> number
+
+  JitBlock** m_fast_block_map_ptr = nullptr;
 };
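
Note (not part of the patch): the sparse-allocation behavior the commit
message relies on can be reproduced in isolation with plain POSIX shm.
Dolphin itself goes through its Common::MemArena wrapper; the segment name,
sizes, and addresses below are made up for the demo. The point is that
reserving 8 GiB of address space costs nothing up front: the kernel only
backs the pages that are actually written.

  // Stand-alone sketch (Linux/macOS): a huge, lazily-backed lookup table.
  #include <cstdint>
  #include <cstdio>
  #include <fcntl.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main()
  {
    // One 8-byte pointer slot per possible 4-byte-aligned guest PC,
    // mirroring FAST_BLOCK_MAP_SIZE: (2^32 / 4) * 8 = 8 GiB.
    constexpr uint64_t kMapSize = 0x2'0000'0000;

    int fd = shm_open("/jitblock-demo", O_CREAT | O_RDWR, 0600);
    if (fd < 0 || ftruncate(fd, static_cast<off_t>(kMapSize)) != 0)
      return 1;

    // MAP_NORESERVE: reserve address space only; physical pages are
    // allocated lazily by the OS on first write.
    void** map = static_cast<void**>(mmap(nullptr, kMapSize, PROT_READ | PROT_WRITE,
                                          MAP_SHARED | MAP_NORESERVE, fd, 0));
    if (map == MAP_FAILED)
      return 1;

    // Index with (pc >> 2), as FastLookupIndexForAddress() does in the
    // non-fallback path; only the touched page becomes resident.
    uint32_t pc = 0x80003100;
    map[pc >> 2] = reinterpret_cast<void*>(0x1234);
    std::printf("slot for pc %08x: %p\n", pc, map[pc >> 2]);

    munmap(map, kMapSize);
    shm_unlink("/jitblock-demo");
    close(fd);
    return 0;
  }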
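
A second illustrative sketch (again not part of the patch; FastIndex and
ByteOffset are hypothetical mirrors of the patch's logic) showing the two
index computations the dispatchers implement, and why the emitted x86-64
code can use SCALE_2 with the raw PC:

  #include <cstddef>
  #include <cstdint>

  // Mirrors FastLookupIndexForAddress(): with the shm map every possible
  // guest PC has its own slot, so no masking is needed; the fallback keeps
  // the old 0x10000-entry array and masks the index into range.
  constexpr uint32_t kFallbackMask = 0x10000 - 1;

  size_t FastIndex(uint32_t address, bool have_shm_map)
  {
    if (have_shm_map)
      return address >> 2;  // PC is 4-byte aligned; one slot per instruction
    return (address >> 2) & kFallbackMask;
  }

  // The byte offset into the table is (address >> 2) * 8, which for a
  // 4-byte-aligned address equals address * 2 -- hence MComplex(base,
  // pc, SCALE_2, 0) in the Jit64 dispatcher.
  size_t ByteOffset(uint32_t address)
  {
    return static_cast<size_t>(address) * 2;
  }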