From fd9c970621032e4f5b0d7056b41eb370c386845e Mon Sep 17 00:00:00 2001
From: Frajo Haider
Date: Tue, 18 Jul 2023 17:15:17 +0300
Subject: [PATCH] JitArm64/Jit64: Extend the fast lookup mmap-ed segment
 further to avoid needing to check the msr bits

In order to avoid a double dereference in the dispatcher, directly store
the normalEntry in the map.

The index into the block map becomes ((((DR << 1) | IR) << 30) | (address >> 2)).
This layout was chosen because the msr bits change less often than the
address, so nearby entries stay close together in the map.

Also, do not call the C++ dispatcher when the assembly dispatcher did not
find a block, since it would not find a block either due to the 1:1
mapping, except when falling back to the non-shm-segment lookup table.
---
 Source/Core/Core/PowerPC/Jit64/JitAsm.cpp     | 77 ++++++++-------
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp  | 50 +++++-----
 .../Core/Core/PowerPC/JitCommon/JitCache.cpp  | 93 ++++++++++++++-----
 Source/Core/Core/PowerPC/JitCommon/JitCache.h | 22 ++---
 4 files changed, 149 insertions(+), 93 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index 844764d1d5..ef667cce86 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -113,15 +113,22 @@ void Jit64AsmRoutineManager::Generate()
   const bool assembly_dispatcher = true;
 
   if (assembly_dispatcher)
   {
-    if (m_jit.GetBlockCache()->GetFastBlockMap())
+    if (m_jit.GetBlockCache()->GetEntryPoints())
     {
-      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
-      MOV(32, R(RSCRATCH), PPCSTATE(pc));
+      MOV(32, R(RSCRATCH2), PPCSTATE(msr));
+      AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
+      SHL(64, R(RSCRATCH2), Imm8(28));
+      MOV(32, R(RSCRATCH_EXTRA), PPCSTATE(pc));
+      OR(64, R(RSCRATCH_EXTRA), R(RSCRATCH2));
+
+      u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetEntryPoints());
       MOV(64, R(RSCRATCH2), Imm64(icache));
-      // Each 4-byte offset of the PC register corresponds to a 8-byte offset
-      // in the lookup table due to host pointers being 8-bytes long.
-      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
+      // The entry points map is indexed by ((msrBits << 26) | (address >> 2)).
+      // The map contains 8-byte pointers, which means we need to shift msr
+      // left by 29 bits and the address left by 1 bit to get the correct
+      // offset in the map.
+      MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_2, 0));
     }
     else
     {
@@ -146,49 +153,57 @@ void Jit64AsmRoutineManager::Generate()
     // Check if we found a block.
     TEST(64, R(RSCRATCH), R(RSCRATCH));
     FixupBranch not_found = J_CC(CC_Z);
+    FixupBranch state_mismatch;
 
-    // Check block.msrBits.
-    MOV(32, R(RSCRATCH2), PPCSTATE(msr));
-    AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
-
-    if (m_jit.GetBlockCache()->GetFastBlockMap())
-    {
-      CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
-    }
-    else
+    if (!m_jit.GetBlockCache()->GetEntryPoints())
     {
+      // Check block.msrBits.
+      MOV(32, R(RSCRATCH2), PPCSTATE(msr));
+      AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
       // Also check the block.effectiveAddress
       SHL(64, R(RSCRATCH2), Imm8(32));
       // RSCRATCH_EXTRA still has the PC.
       OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
       CMP(64, R(RSCRATCH2),
           MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+
+      state_mismatch = J_CC(CC_NE);
+      // Success; branch to the block we found.
+      JMPptr(MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, normalEntry))));
+    }
+    else
+    {
+      // Success; branch to the block we found.
+      JMPptr(R(RSCRATCH));
     }
 
-    FixupBranch state_mismatch = J_CC(CC_NE);
-
-    // Success; branch to the block we found.
-    JMPptr(MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, normalEntry))));
-
     SetJumpTarget(not_found);
-    SetJumpTarget(state_mismatch);
+    if (!m_jit.GetBlockCache()->GetEntryPoints())
+    {
+      SetJumpTarget(state_mismatch);
+    }
 
     // Failure, fallback to the C++ dispatcher for calling the JIT.
   }
 
-  // Ok, no block, let's call the slow dispatcher
-  ABI_PushRegistersAndAdjustStack({}, 0);
-  MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&m_jit)));
-  ABI_CallFunction(JitBase::Dispatch);
-  ABI_PopRegistersAndAdjustStack({}, 0);
+  // There is no point in calling the dispatcher in the fast lookup table
+  // case, since the assembly dispatcher would already have found a block.
+  if (!assembly_dispatcher || !m_jit.GetBlockCache()->GetEntryPoints())
+  {
+    // Ok, no block, let's call the slow dispatcher
+    ABI_PushRegistersAndAdjustStack({}, 0);
+    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&m_jit)));
+    ABI_CallFunction(JitBase::Dispatch);
+    ABI_PopRegistersAndAdjustStack({}, 0);
 
-  TEST(64, R(ABI_RETURN), R(ABI_RETURN));
-  FixupBranch no_block_available = J_CC(CC_Z);
+    TEST(64, R(ABI_RETURN), R(ABI_RETURN));
+    FixupBranch no_block_available = J_CC(CC_Z);
 
-  // Jump to the block
-  JMPptr(R(ABI_RETURN));
+    // Jump to the block
+    JMPptr(R(ABI_RETURN));
 
-  SetJumpTarget(no_block_available);
+    SetJumpTarget(no_block_available);
+  }
 
   // We reset the stack because Jit might clear the code cache.
   // Also if we are in the middle of disabling BLR optimization on windows
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index bc4c77255e..5ff59e8d43 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -97,32 +97,21 @@ void JitArm64::GenerateAsm()
 
   if (assembly_dispatcher)
   {
-    if (GetBlockCache()->GetFastBlockMap())
+    if (GetBlockCache()->GetEntryPoints())
     {
       // Check if there is a block
-      ARM64Reg pc_masked = ARM64Reg::X25;
-      ARM64Reg cache_base = ARM64Reg::X24;
+      ARM64Reg pc_and_msr = ARM64Reg::X25;
+      ARM64Reg cache_base = ARM64Reg::X27;
       ARM64Reg block = ARM64Reg::X30;
-      LSL(pc_masked, DISPATCHER_PC, 1);
-      MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
-      LDR(block, cache_base, pc_masked);
+      LDR(IndexType::Unsigned, EncodeRegTo32(pc_and_msr), PPC_REG, PPCSTATE_OFF(msr));
+      MOVP2R(cache_base, GetBlockCache()->GetEntryPoints());
+      // The entry points map is indexed by ((msrBits << 26) | (address >> 2)).
+      UBFIZ(pc_and_msr, pc_and_msr, 26, 6);
+      BFXIL(pc_and_msr, EncodeRegTo64(DISPATCHER_PC), 2, 30);
+      LDR(block, cache_base, ArithOption(pc_and_msr, true));
       FixupBranch not_found = CBZ(block);
-
-      // b.msrBits != msr
-      ARM64Reg msr = ARM64Reg::W27;
-      ARM64Reg msr2 = ARM64Reg::W24;
-      LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
-      AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
-      LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
-      CMP(msr, msr2);
-
-      FixupBranch msr_missmatch = B(CC_NEQ);
-
-      // return blocks[block_num].normalEntry;
-      LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
       BR(block);
       SetJumpTarget(not_found);
-      SetJumpTarget(msr_missmatch);
     }
     else
     {
@@ -160,18 +149,25 @@ void JitArm64::GenerateAsm()
     }
   }
 
-  // Call C version of Dispatch().
   STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc));
-  MOVP2R(ARM64Reg::X8, reinterpret_cast<void*>(&JitBase::Dispatch));
-  MOVP2R(ARM64Reg::X0, this);
-  BLR(ARM64Reg::X8);
 
-  FixupBranch no_block_available = CBZ(ARM64Reg::X0);
+  // There is no point in calling the dispatcher in the fast lookup table
+  // case, since the assembly dispatcher would already have found a block.
+  if (!assembly_dispatcher || !GetBlockCache()->GetEntryPoints())
+  {
+    // Call C version of Dispatch().
+    MOVP2R(ARM64Reg::X8, reinterpret_cast<void*>(&JitBase::Dispatch));
+    MOVP2R(ARM64Reg::X0, this);
+    BLR(ARM64Reg::X8);
 
-  BR(ARM64Reg::X0);
+    FixupBranch no_block_available = CBZ(ARM64Reg::X0);
+
+    BR(ARM64Reg::X0);
+
+    SetJumpTarget(no_block_available);
+  }
 
   // Call JIT
-  SetJumpTarget(no_block_available);
   ResetStack();
   MOVP2R(ARM64Reg::X0, this);
   MOV(ARM64Reg::W1, DISPATCHER_PC);
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
index 95fad45bde..c8dddbc4ae 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
@@ -43,14 +43,10 @@ void JitBaseBlockCache::Init()
   Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));
 
 #ifdef _ARCH_64
-  m_fast_block_map = reinterpret_cast<JitBlock**>(m_block_map_arena.Create(FAST_BLOCK_MAP_SIZE));
+  m_entry_points_ptr = reinterpret_cast<u8**>(m_entry_points_arena.Create(FAST_BLOCK_MAP_SIZE));
 #else
-  m_fast_block_map = nullptr;
+  m_entry_points_ptr = nullptr;
 #endif
-  if (m_fast_block_map)
-    m_fast_block_map_ptr = m_fast_block_map;
-  else
-    m_fast_block_map_ptr = m_fast_block_map_fallback.data();
 
   Clear();
 }
@@ -59,7 +55,7 @@ void JitBaseBlockCache::Shutdown()
 {
   Common::JitRegister::Shutdown();
 
-  m_block_map_arena.Release();
+  m_entry_points_arena.Release();
 }
 
 // This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -82,8 +78,8 @@ void JitBaseBlockCache::Clear()
 
   valid_block.ClearAll();
 
-  if (m_fast_block_map)
-    m_block_map_arena.Clear();
+  if (m_entry_points_ptr)
+    m_entry_points_arena.Clear();
 }
 
 void JitBaseBlockCache::Reset()
@@ -92,9 +88,9 @@ void JitBaseBlockCache::Reset()
   Init();
 }
 
-JitBlock** JitBaseBlockCache::GetFastBlockMap()
+u8** JitBaseBlockCache::GetEntryPoints()
 {
-  return m_fast_block_map;
+  return m_entry_points_ptr;
 }
 
 JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
@@ -123,8 +119,11 @@ JitBlock* JitBaseBlockCache::AllocateBlock(u32 em_address)
 void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
                                       const std::set<u32>& physical_addresses)
 {
-  size_t index = FastLookupIndexForAddress(block.effectiveAddress);
-  m_fast_block_map_ptr[index] = &block;
+  size_t index = FastLookupIndexForAddress(block.effectiveAddress, block.msrBits);
+  if (m_entry_points_ptr)
+    m_entry_points_ptr[index] = block.normalEntry;
+  else
+    m_fast_block_map_fallback[index] = &block;
   block.fast_block_map_index = index;
 
   block.physical_addresses = physical_addresses;
@@ -187,7 +186,28 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
 const u8* JitBaseBlockCache::Dispatch()
 {
   const auto& ppc_state = m_jit.m_ppc_state;
-  JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)];
+  if (m_entry_points_ptr)
+  {
+    u8* entry_point =
+        m_entry_points_ptr[FastLookupIndexForAddress(ppc_state.pc, ppc_state.msr.Hex)];
+    if (entry_point)
+    {
+      return entry_point;
+    }
+    else
+    {
+      JitBlock* block =
+          MoveBlockIntoFastCache(ppc_state.pc, ppc_state.msr.Hex & JIT_CACHE_MSR_MASK);
+
+      if (!block)
+        return nullptr;
+
+      return block->normalEntry;
+    }
+  }
+
+  JitBlock* block =
+      m_fast_block_map_fallback[FastLookupIndexForAddress(ppc_state.pc, ppc_state.msr.Hex)];
 
   if (!block || block->effectiveAddress != ppc_state.pc ||
       block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -408,8 +428,20 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
 
 void JitBaseBlockCache::DestroyBlock(JitBlock& block)
 {
-  if (m_fast_block_map_ptr[block.fast_block_map_index] == &block)
-    m_fast_block_map_ptr[block.fast_block_map_index] = nullptr;
+  if (m_entry_points_ptr)
+  {
+    if (m_entry_points_ptr[block.fast_block_map_index] == block.normalEntry)
+    {
+      m_entry_points_ptr[block.fast_block_map_index] = nullptr;
+    }
+  }
+  else
+  {
+    if (m_fast_block_map_fallback[block.fast_block_map_index] == &block)
+    {
+      m_fast_block_map_fallback[block.fast_block_map_index] = nullptr;
+    }
+  }
 
   UnlinkBlock(block);
 
@@ -436,22 +468,37 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
     return nullptr;
 
   // Drop old fast block map entry
-  if (m_fast_block_map_ptr[block->fast_block_map_index] == block)
-    m_fast_block_map_ptr[block->fast_block_map_index] = nullptr;
+  if (m_entry_points_ptr)
+  {
+    if (m_entry_points_ptr[block->fast_block_map_index] == block->normalEntry)
+    {
+      m_entry_points_ptr[block->fast_block_map_index] = nullptr;
+    }
+  }
+  else
+  {
+    if (m_fast_block_map_fallback[block->fast_block_map_index] == block)
+    {
+      m_fast_block_map_fallback[block->fast_block_map_index] = nullptr;
+    }
+  }
 
   // And create a new one
-  size_t index = FastLookupIndexForAddress(addr);
-  m_fast_block_map_ptr[index] = block;
+  size_t index = FastLookupIndexForAddress(addr, msr);
+  if (m_entry_points_ptr)
+    m_entry_points_ptr[index] = block->normalEntry;
+  else
+    m_fast_block_map_fallback[index] = block;
   block->fast_block_map_index = index;
 
   return block;
 }
 
-size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address)
+size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address, u32 msr)
 {
-  if (m_fast_block_map)
+  if (m_entry_points_ptr)
   {
-    return address >> 2;
+    return ((msr & JIT_CACHE_MSR_MASK) << 26) | (address >> 2);
   }
   else
   {
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
index cf6f785d98..e7978f2058 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
@@ -133,8 +133,8 @@ public:
   static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;
 
   // The value for the map is determined like this:
-  // ((4 GB guest memory space) / (4 bytes per address)) * sizeof(JitBlock*)
-  static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000;
+  // ((4 GB guest memory space) / (4 bytes per address) * sizeof(JitBlock*)) * (4 for 2 bits of msr)
+  static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x8'0000'0000;
   static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
   static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;
 
@@ -147,7 +147,7 @@ public:
   void Reset();
 
   // Code Cache
-  JitBlock** GetFastBlockMap();
+  u8** GetEntryPoints();
   JitBlock** GetFastBlockMapFallback();
 
   void RunOnBlocks(std::function<void(const JitBlock&)> f);
@@ -188,7 +188,7 @@ private:
   JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr);
 
   // Fast but risky block lookup based on fast_block_map.
-  size_t FastLookupIndexForAddress(u32 address);
+  size_t FastLookupIndexForAddress(u32 address, u32 msr);
 
   // links_to hold all exit points of all valid blocks in a reverse way.
   // It is used to query all blocks which links to an address.
@@ -208,16 +208,14 @@ private:
   // It is used to provide a fast way to query if no icache invalidation is needed.
   ValidBlockBitSet valid_block;
 
-  // This array is indexed with the shifted PC and likely holds the correct block id.
-  // This is used as a fast cache of block_map used in the assembly dispatcher.
-  // It is implemented via a shm segment using m_block_map_arena.
-  JitBlock** m_fast_block_map = 0;
-  Common::LazyMemoryRegion m_block_map_arena;
+  // This contains the entry points for each block.
+  // It is used by the assembly dispatcher to quickly
+  // know where to jump based on pc and msr bits.
+  Common::LazyMemoryRegion m_entry_points_arena;
+  u8** m_entry_points_ptr = 0;
 
-  // An alternative for the above fast_block_map but without a shm segment
+  // An alternative for the above but without a shm segment
   // in case the shm memory region couldn't be allocated.
   std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
       m_fast_block_map_fallback{};  // start_addr & mask -> number
-
-  JitBlock** m_fast_block_map_ptr = 0;
 };
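Editor's note (not part of the patch): the following minimal standalone C++ sketch restates the index and size math described in the commit message and in the FAST_BLOCK_MAP_SIZE comment, assuming a 64-bit host. FastLookupIndex and kMsrMask are hypothetical stand-ins for JitBaseBlockCache::FastLookupIndexForAddress and JIT_CACHE_MSR_MASK; the real lookup also lives in the dispatchers' generated assembly.

#include <cstdint>

// Stand-in for JIT_CACHE_MSR_MASK: MSR.DR and MSR.IR (bits 4 and 5).
constexpr uint32_t kMsrMask = 0x30;

// Index = (msrBits << 26) | (address >> 2). The two masked msr bits land in
// bits 30-31 of the index, so each DR/IR combination gets its own copy of the
// 4 GB guest address space while nearby addresses stay adjacent in the map.
constexpr uint64_t FastLookupIndex(uint32_t address, uint32_t msr)
{
  return (static_cast<uint64_t>(msr & kMsrMask) << 26) | (address >> 2);
}

// 2^32 possible indices times 8 bytes per entry pointer gives the
// 0x8'0000'0000-byte reservation used for FAST_BLOCK_MAP_SIZE.
static_assert(FastLookupIndex(0xFFFF'FFFC, 0x30) == 0xFFFF'FFFFull, "max index");
static_assert((FastLookupIndex(0xFFFF'FFFC, 0x30) + 1) * 8 == 0x8'0000'0000ull, "map size");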