From 3df09f349d5ebe6de8694ee1ec08797748132387 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 16 Jun 2023 19:49:13 +0200
Subject: [PATCH 1/5] JitArm64: Prefer X8 and up for temporary registers in
 JitAsm

Just to make the code easier to understand at a glance. I especially
found it a bit annoying to reason about whether callee-saved registers
like W28 were being used because we needed a callee-saved register or
just for no reason in particular. X8 and up is what compilers normally
use when they're not register starved.
---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 50 ++++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 0adb657841..db5b2344f2 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -41,17 +41,17 @@ void JitArm64::GenerateAsm()
   enter_code = GetCodePtr();
 
   ABI_PushRegisters(regs_to_save);
-  m_float_emit.ABI_PushRegisters(regs_to_save_fpr, ARM64Reg::X30);
+  m_float_emit.ABI_PushRegisters(regs_to_save_fpr, ARM64Reg::X8);
 
   MOVP2R(PPC_REG, &m_ppc_state);
 
   // Store the stack pointer, so we can reset it if the BLR optimization fails.
-  ADD(ARM64Reg::X0, ARM64Reg::SP, 0);
-  STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
+  ADD(ARM64Reg::X8, ARM64Reg::SP, 0);
+  STR(IndexType::Unsigned, ARM64Reg::X8, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
 
   // Push {nullptr; -1} as invalid destination on the stack.
-  MOVI2R(ARM64Reg::X0, 0xFFFF'FFFF'FFFF'FFFF);
-  STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X0, ARM64Reg::SP, -16);
+  MOVI2R(ARM64Reg::X8, 0xFFFF'FFFF'FFFF'FFFF);
+  STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X8, ARM64Reg::SP, -16);
 
   // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
   // Advance() does an exception check so we don't know what PC to use until afterwards.
@@ -86,9 +86,9 @@ void JitArm64::GenerateAsm()
   FixupBranch debug_exit;
   if (enable_debugging)
   {
-    LDR(IndexType::Unsigned, ARM64Reg::W0, ARM64Reg::X0,
-        MOVPage2R(ARM64Reg::X0, cpu.GetStatePtr()));
-    debug_exit = CBNZ(ARM64Reg::W0);
+    LDR(IndexType::Unsigned, ARM64Reg::W8, ARM64Reg::X8,
+        MOVPage2R(ARM64Reg::X8, cpu.GetStatePtr()));
+    debug_exit = CBNZ(ARM64Reg::W8);
   }
 
   dispatcher_no_check = GetCodePtr();
@@ -100,9 +100,9 @@ void JitArm64::GenerateAsm()
     if (GetBlockCache()->GetEntryPoints())
     {
       // Check if there is a block
-      ARM64Reg pc_and_msr = ARM64Reg::X25;
-      ARM64Reg cache_base = ARM64Reg::X27;
-      ARM64Reg block = ARM64Reg::X30;
+      ARM64Reg pc_and_msr = ARM64Reg::X8;
+      ARM64Reg cache_base = ARM64Reg::X9;
+      ARM64Reg block = ARM64Reg::X10;
       LDR(IndexType::Unsigned, EncodeRegTo32(pc_and_msr), PPC_REG, PPCSTATE_OFF(msr));
       MOVP2R(cache_base, GetBlockCache()->GetEntryPoints());
       // The entry points map is indexed by ((msrBits << 26) | (address >> 2)).
@@ -116,9 +116,9 @@ void JitArm64::GenerateAsm()
     else
     {
       // iCache[(address >> 2) & iCache_Mask];
-      ARM64Reg pc_masked = ARM64Reg::W25;
-      ARM64Reg cache_base = ARM64Reg::X27;
-      ARM64Reg block = ARM64Reg::X30;
+      ARM64Reg pc_masked = ARM64Reg::W8;
+      ARM64Reg cache_base = ARM64Reg::X9;
+      ARM64Reg block = ARM64Reg::X10;
       ORR(pc_masked, ARM64Reg::WZR,
           LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
       AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
@@ -127,25 +127,25 @@ void JitArm64::GenerateAsm()
       FixupBranch not_found = CBZ(block);
 
       // b.effectiveAddress != addr || b.msrBits != msr
-      ARM64Reg pc_and_msr = ARM64Reg::W25;
-      ARM64Reg pc_and_msr2 = ARM64Reg::W24;
+      ARM64Reg pc_and_msr = ARM64Reg::W11;
+      ARM64Reg pc_and_msr2 = ARM64Reg::W12;
       LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
       CMP(pc_and_msr, DISPATCHER_PC);
-      FixupBranch pc_missmatch = B(CC_NEQ);
+      FixupBranch pc_mismatch = B(CC_NEQ);
 
       LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
       AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
       LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
       CMP(pc_and_msr, pc_and_msr2);
 
-      FixupBranch msr_missmatch = B(CC_NEQ);
+      FixupBranch msr_mismatch = B(CC_NEQ);
 
       // return blocks[block_num].normalEntry;
       LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
       BR(block);
       SetJumpTarget(not_found);
-      SetJumpTarget(pc_missmatch);
-      SetJumpTarget(msr_missmatch);
+      SetJumpTarget(pc_mismatch);
+      SetJumpTarget(msr_mismatch);
     }
   }
 
@@ -182,8 +182,8 @@ void JitArm64::GenerateAsm()
 
   // Check the state pointer to see if we are exiting
   // Gets checked on at the end of every slice
-  LDR(IndexType::Unsigned, ARM64Reg::W0, ARM64Reg::X0, MOVPage2R(ARM64Reg::X0, cpu.GetStatePtr()));
-  FixupBranch exit = CBNZ(ARM64Reg::W0);
+  LDR(IndexType::Unsigned, ARM64Reg::W8, ARM64Reg::X8, MOVPage2R(ARM64Reg::X8, cpu.GetStatePtr()));
+  FixupBranch exit = CBNZ(ARM64Reg::W8);
 
   SetJumpTarget(to_start_of_timing_slice);
   ABI_CallFunction(&CoreTiming::GlobalAdvance);
@@ -212,10 +212,10 @@ void JitArm64::GenerateAsm()
 
   // Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
   // without popping them.
-  LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
-  ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
+  LDR(IndexType::Unsigned, ARM64Reg::X8, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
+  ADD(ARM64Reg::SP, ARM64Reg::X8, 0);
 
-  m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X30);
+  m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X8);
   ABI_PopRegisters(regs_to_save);
   RET(ARM64Reg::X30);
 

From 4a4e7d9b8a35ddf19a1441e7ca401498fb4bda62 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 19 Aug 2023 12:10:08 +0200
Subject: [PATCH 2/5] Jit: Swap locations of effectiveAddress and msrBits

This slightly improves instruction-level parallelism in Jit64's slow
dispatcher by shifting the PC left instead of the MSR.

In the past, this also enabled an optimization in JitArm64's fast path
where we could use LDP to load normalEntry and msrBits in one
instruction, but this was superseded by fd9c970.
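To illustrate the idea outside the emitter, here is a minimal standalone C++
sketch (BlockKey and KeyMatches are made-up names for illustration only, and a
little-endian host is assumed, as on the x86-64 and AArch64 targets): with
msrBits placed in the 4 bytes directly before effectiveAddress, the dispatcher
can pack the current PC and the masked MSR into one 64-bit value and check the
block's fields with a single 64-bit load and compare. Shifting the PC rather
than the MSR means the shift no longer has to wait for the masking.

    // Sketch only -- not the code the JIT emits.
    #include <cstdint>
    #include <cstring>

    struct BlockKey  // hypothetical stand-in for the two JitBlockData fields
    {
      uint32_t msrBits;           // low half of the packed 64-bit key
      uint32_t effectiveAddress;  // high half of the packed 64-bit key
    };

    bool KeyMatches(const BlockKey& block, uint32_t pc, uint32_t masked_msr)
    {
      // The PC shift and the MSR masking are independent of each other,
      // so the CPU can execute them in parallel.
      const uint64_t expected = (static_cast<uint64_t>(pc) << 32) | masked_msr;

      uint64_t stored;
      std::memcpy(&stored, &block, sizeof(stored));  // one 64-bit load of both fields
      return stored == expected;                     // one 64-bit compare
    }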
---
 Source/Core/Core/PowerPC/Jit64/JitAsm.cpp     | 17 +++++++----------
 Source/Core/Core/PowerPC/JitCommon/JitCache.h |  4 ++--
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index 36189dbb40..f77afc563c 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -19,11 +19,6 @@
 #include "Core/System.h"
 
 using namespace Gen;
-
-// These need to be next of each other so that the assembly
-// code can compare them easily.
-static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));
-
 Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(jit)
 {
 }
@@ -168,12 +163,14 @@ void Jit64AsmRoutineManager::Generate()
     // Check block.msrBits.
     MOV(32, R(RSCRATCH2), PPCSTATE(msr));
     AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
-    // Also check the block.effectiveAddress
-    SHL(64, R(RSCRATCH2), Imm8(32));
-    // RSCRATCH_EXTRA still has the PC.
+    // Also check the block.effectiveAddress. RSCRATCH_EXTRA still has the PC.
+    SHL(64, R(RSCRATCH_EXTRA), Imm8(32));
     OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
-    CMP(64, R(RSCRATCH2),
-        MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
+
+    static_assert(offsetof(JitBlockData, msrBits) + 4 ==
+                  offsetof(JitBlockData, effectiveAddress));
+
+    CMP(64, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
     state_mismatch = J_CC(CC_NE);
 
     // Success; branch to the block we found.
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
index e7978f2058..de1486ceb2 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
@@ -33,10 +33,10 @@ struct JitBlockData
   // The normal entry point for the block, returned by Dispatch().
   u8* normalEntry;
 
-  // The effective address (PC) for the beginning of the block.
-  u32 effectiveAddress;
   // The MSR bits expected for this block to be valid; see JIT_CACHE_MSR_MASK.
   u32 msrBits;
+  // The effective address (PC) for the beginning of the block.
+  u32 effectiveAddress;
   // The physical address of the code represented by this block.
   // Various maps in the cache are indexed by this (block_map
   // and valid_block in particular). This is useful because of

From c9347a2a194e10af0719cad2d2286b16fdaddf75 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 16 Jun 2023 19:55:24 +0200
Subject: [PATCH 3/5] JitArm64: Use LDP in slow dispatcher

With one LDP instruction, we can replace two LDR instructions.
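As a rough standalone C++ illustration of why the paired load works (BlockFields
and LoadMsrAndPc are made-up names; this is not the dispatcher itself): because
msrBits and effectiveAddress are adjacent after the previous commit, a single
aligned 8-byte access can fetch both values that the block check needs.

    // Sketch only -- field order mirrors the swapped JitBlockData layout.
    #include <cstddef>
    #include <cstdint>

    struct BlockFields
    {
      uint32_t msrBits;
      uint32_t effectiveAddress;
    };
    static_assert(offsetof(BlockFields, msrBits) + 4 == offsetof(BlockFields, effectiveAddress));

    // Rough equivalent of "LDP w_msr, w_pc, [block, #offset]": one paired load
    // fills both registers instead of two separate LDRs.
    void LoadMsrAndPc(const BlockFields& block, uint32_t& msr, uint32_t& pc)
    {
      msr = block.msrBits;
      pc = block.effectiveAddress;
    }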
---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 23 +++++++++++---------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index db5b2344f2..0cfece976c 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -115,10 +115,14 @@ void JitArm64::GenerateAsm()
     }
     else
     {
-      // iCache[(address >> 2) & iCache_Mask];
       ARM64Reg pc_masked = ARM64Reg::W8;
       ARM64Reg cache_base = ARM64Reg::X9;
       ARM64Reg block = ARM64Reg::X10;
+      ARM64Reg pc = ARM64Reg::W11;
+      ARM64Reg msr = ARM64Reg::W12;
+      ARM64Reg msr2 = ARM64Reg::W13;
+
+      // iCache[(address >> 2) & iCache_Mask];
       ORR(pc_masked, ARM64Reg::WZR,
           LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
       AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
@@ -127,22 +131,21 @@ void JitArm64::GenerateAsm()
       FixupBranch not_found = CBZ(block);
 
       // b.effectiveAddress != addr || b.msrBits != msr
-      ARM64Reg pc_and_msr = ARM64Reg::W11;
-      ARM64Reg pc_and_msr2 = ARM64Reg::W12;
-      LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
-      CMP(pc_and_msr, DISPATCHER_PC);
+      static_assert(offsetof(JitBlockData, msrBits) + 4 ==
+                    offsetof(JitBlockData, effectiveAddress));
+      LDP(IndexType::Signed, msr, pc, block, offsetof(JitBlockData, effectiveAddress));
+      CMP(pc, DISPATCHER_PC);
       FixupBranch pc_mismatch = B(CC_NEQ);
 
-      LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
-      AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
-      LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
-      CMP(pc_and_msr, pc_and_msr2);
-
+      LDR(IndexType::Unsigned, msr2, PPC_REG, PPCSTATE_OFF(msr));
+      AND(msr2, msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
+      CMP(msr, msr2);
       FixupBranch msr_mismatch = B(CC_NEQ);
 
       // return blocks[block_num].normalEntry;
       LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
       BR(block);
+
       SetJumpTarget(not_found);
       SetJumpTarget(pc_mismatch);
       SetJumpTarget(msr_mismatch);

From 9e970bcb30d1281c1e48b5ebec5f734ad7c1a022 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 19 Aug 2023 13:51:03 +0200
Subject: [PATCH 4/5] JitArm64: Optimize shifting and masking PC in slow
 dispatcher

Instead of shifting left by 1, we can first shift right by 2 and then
left by 3. This is both faster and smaller, because we get the right
shift for free with the masking and the left shift for free with the
address calculation.

It also happens to match the pseudocode more closely, which is always
nice for readability.
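The equivalence of the old and new index math can be checked with a small
standalone C++ snippet (the table size below is made up; it is not the real
FAST_BLOCK_MAP_FALLBACK_ELEMENTS value): the old sequence ANDed a pre-shifted
mask with the PC shifted left by 1, while the new sequence extracts the index
bits starting at bit 2 and lets the scaled load apply the shift left by 3.
Both yield the same byte offset into the fallback block map.

    // Sketch only -- made-up constants, not the emitted AArch64 code.
    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kElements = 1u << 16;   // hypothetical fallback map size
    constexpr uint32_t kMask = kElements - 1;  // analogue of FAST_BLOCK_MAP_FALLBACK_MASK

    uint64_t ByteOffsetOld(uint32_t address)
    {
      // ORR a pre-shifted mask into a register, then AND it with (address << 1).
      return (static_cast<uint64_t>(kMask) << 3) & (static_cast<uint64_t>(address) << 1);
    }

    uint64_t ByteOffsetNew(uint32_t address)
    {
      const uint32_t index = (address >> 2) & kMask;  // UBFX: right shift comes with the masking
      return static_cast<uint64_t>(index) << 3;       // left shift comes with the scaled LDR
    }

    int main()
    {
      const uint32_t addresses[] = {0x80003100u, 0x813FBFF4u, 0xFFFFFFFCu};
      for (uint32_t address : addresses)
        assert(ByteOffsetOld(address) == ByteOffsetNew(address));
      return 0;
    }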
---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 0cfece976c..5cd554ce89 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -123,11 +123,10 @@ void JitArm64::GenerateAsm()
       ARM64Reg msr2 = ARM64Reg::W13;
 
       // iCache[(address >> 2) & iCache_Mask];
-      ORR(pc_masked, ARM64Reg::WZR,
-          LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
-      AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
+      UBFX(pc_masked, DISPATCHER_PC, 2,
+           MathUtil::IntLog2(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_ELEMENTS) - 2);
       MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
-      LDR(block, cache_base, EncodeRegTo64(pc_masked));
+      LDR(block, cache_base, ArithOption(EncodeRegTo64(pc_masked), true));
       FixupBranch not_found = CBZ(block);
 
       // b.effectiveAddress != addr || b.msrBits != msr

From 06c7862160320b3c611504776da1d10912faea0c Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 19 Aug 2023 14:10:33 +0200
Subject: [PATCH 5/5] JitArm64: Rearrange dispatcher instructions to improve
 scheduling

Loads can take a little while to complete.
---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 5cd554ce89..ef51bb6b35 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -121,11 +121,12 @@ void JitArm64::GenerateAsm()
       ARM64Reg pc = ARM64Reg::W11;
       ARM64Reg msr = ARM64Reg::W12;
       ARM64Reg msr2 = ARM64Reg::W13;
+      ARM64Reg entry = ARM64Reg::X14;
 
       // iCache[(address >> 2) & iCache_Mask];
+      MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
       UBFX(pc_masked, DISPATCHER_PC, 2,
            MathUtil::IntLog2(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_ELEMENTS) - 2);
-      MOVP2R(cache_base, GetBlockCache()->GetFastBlockMapFallback());
       LDR(block, cache_base, ArithOption(EncodeRegTo64(pc_masked), true));
       FixupBranch not_found = CBZ(block);
 
@@ -133,17 +134,17 @@ void JitArm64::GenerateAsm()
       static_assert(offsetof(JitBlockData, msrBits) + 4 ==
                     offsetof(JitBlockData, effectiveAddress));
       LDP(IndexType::Signed, msr, pc, block, offsetof(JitBlockData, effectiveAddress));
+      LDR(IndexType::Unsigned, msr2, PPC_REG, PPCSTATE_OFF(msr));
       CMP(pc, DISPATCHER_PC);
       FixupBranch pc_mismatch = B(CC_NEQ);
 
-      LDR(IndexType::Unsigned, msr2, PPC_REG, PPCSTATE_OFF(msr));
+      LDR(IndexType::Unsigned, entry, block, offsetof(JitBlockData, normalEntry));
       AND(msr2, msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
       CMP(msr, msr2);
       FixupBranch msr_mismatch = B(CC_NEQ);
 
       // return blocks[block_num].normalEntry;
-      LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
-      BR(block);
+      BR(entry);
 
       SetJumpTarget(not_found);
       SetJumpTarget(pc_mismatch);