From 384efb0cb204676dd53e6216aa8dcc537cc039e8 Mon Sep 17 00:00:00 2001 From: degasus Date: Wed, 1 Feb 2017 00:10:32 +0100 Subject: [PATCH] JitArm64: Initial implementation of the BLR optimization. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 162 ++++++++++++++++-- Source/Core/Core/PowerPC/JitArm64/Jit.h | 11 +- .../Core/PowerPC/JitArm64/JitArm64Cache.cpp | 26 ++- .../Core/PowerPC/JitArm64/JitArm64_Branch.cpp | 16 +- .../JitArm64/JitArm64_SystemRegisters.cpp | 4 + Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 14 ++ Source/Core/Core/PowerPC/JitCommon/JitCache.h | 1 + Source/Core/Core/PowerPC/PowerPC.h | 3 + 8 files changed, 209 insertions(+), 28 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index d12c542399..a7e7dab60a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -46,9 +46,7 @@ void JitArm64::Init() UpdateMemoryOptions(); gpr.Init(this); fpr.Init(this); - blocks.Init(); - GenerateAsm(); code_block.m_stats = &js.st; code_block.m_gpa = &js.gpa; @@ -56,6 +54,9 @@ void JitArm64::Init() analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW); + m_enable_blr_optimization = true; + + GenerateAsm(); m_supports_cycle_counter = HasCycleCounters(); } @@ -192,8 +193,16 @@ void JitArm64::DoDownCount() gpr.Unlock(WA, WB); } -// Exits -void JitArm64::WriteExit(u32 destination) +void JitArm64::ResetStack() +{ + if (!m_enable_blr_optimization) + return; + + LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); + SUB(SP, X0, 16); +} + +void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return) { Cleanup(); DoDownCount(); @@ -201,31 +210,159 @@ void JitArm64::WriteExit(u32 destination) if (Profiler::g_ProfileBlocks) EndTimeProfile(js.curBlock); - // If nobody has taken 
care of this yet (this can be removed when all branches are done) + LK &= m_enable_blr_optimization; + + if (LK) + { + // Push {ARM_PC+20; PPC_PC} on the stack + MOVI2R(X1, exit_address_after_return); + ADR(X0, 20); + STP(INDEX_PRE, X0, X1, SP, -16); + } + JitBlock* b = js.curBlock; JitBlock::LinkData linkData; linkData.exitAddress = destination; linkData.exitPtrs = GetWritableCodePtr(); linkData.linkStatus = false; + linkData.call = LK; b->linkData.push_back(linkData); MOVI2R(DISPATCHER_PC, destination); - B(dispatcher); + + if (!LK) + { + B(dispatcher); + } + else + { + BL(dispatcher); + + // MOVI2R might only require one instruction. So the const offset of 20 bytes + // might be wrong. Be sure and just add a NOP here. + HINT(HINT_NOP); + + // Write the regular exit node after the return. + linkData.exitAddress = exit_address_after_return; + linkData.exitPtrs = GetWritableCodePtr(); + linkData.linkStatus = false; + linkData.call = false; + b->linkData.push_back(linkData); + + MOVI2R(DISPATCHER_PC, exit_address_after_return); + B(dispatcher); + } } -void JitArm64::WriteExit(ARM64Reg Reg) +void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return) { Cleanup(); DoDownCount(); - if (Reg != DISPATCHER_PC) - MOV(DISPATCHER_PC, Reg); - gpr.Unlock(Reg); + LK &= m_enable_blr_optimization; + + if (dest != DISPATCHER_PC) + MOV(DISPATCHER_PC, dest); + gpr.Unlock(dest); if (Profiler::g_ProfileBlocks) EndTimeProfile(js.curBlock); + if (!LK) + { + B(dispatcher); + } + else + { + // Push {ARM_PC, PPC_PC} on the stack + MOVI2R(X1, exit_address_after_return); + ADR(X0, 12); + STP(INDEX_PRE, X0, X1, SP, -16); + + BL(dispatcher); + + // Write the regular exit node after the return. 
+ JitBlock* b = js.curBlock; + JitBlock::LinkData linkData; + linkData.exitAddress = exit_address_after_return; + linkData.exitPtrs = GetWritableCodePtr(); + linkData.linkStatus = false; + linkData.call = false; + b->linkData.push_back(linkData); + + MOVI2R(DISPATCHER_PC, exit_address_after_return); + B(dispatcher); + } +} + +void JitArm64::FakeLKExit(u32 exit_address_after_return) +{ + if (!m_enable_blr_optimization) + return; + + // We may need to fake the BLR stack on inlined CALL instructions. + // Else we can't return to this location any more. + ARM64Reg after_reg = gpr.GetReg(); + ARM64Reg code_reg = gpr.GetReg(); + MOVI2R(after_reg, exit_address_after_return); + ADR(EncodeRegTo64(code_reg), 12); + STP(INDEX_PRE, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), SP, -16); + gpr.Unlock(after_reg, code_reg); + + FixupBranch skip_exit = BL(); + + // Write the regular exit node after the return. + JitBlock* b = js.curBlock; + JitBlock::LinkData linkData; + linkData.exitAddress = exit_address_after_return; + linkData.exitPtrs = GetWritableCodePtr(); + linkData.linkStatus = false; + linkData.call = false; + b->linkData.push_back(linkData); + + MOVI2R(DISPATCHER_PC, exit_address_after_return); B(dispatcher); + + SetJumpTarget(skip_exit); +} + +void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest) +{ + if (!m_enable_blr_optimization) + { + WriteExit(dest); + return; + } + + Cleanup(); + + if (Profiler::g_ProfileBlocks) + EndTimeProfile(js.curBlock); + + ARM64Reg code = gpr.GetReg(); + ARM64Reg pc = gpr.GetReg(); + + // Check if {ARM_PC, PPC_PC} matches the current state. 
+ LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16); + CMP(pc, dest); + FixupBranch no_match = B(CC_NEQ); + + DoDownCount(); + + RET(EncodeRegTo64(code)); + + SetJumpTarget(no_match); + + DoDownCount(); + + if (dest != DISPATCHER_PC) + MOV(DISPATCHER_PC, dest); + + ResetStack(); + + B(dispatcher); + + gpr.Unlock(dest, pc, code); } void JitArm64::WriteExceptionExit(u32 destination, bool only_external) @@ -399,11 +536,11 @@ void JitArm64::Jit(u32) } JitBlock* b = blocks.AllocateBlock(em_address); - const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC); + DoJit(em_address, &code_buffer, b, nextPC); blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses); } -const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC) +void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC) { if (em_address == 0) { @@ -629,5 +766,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB FlushIcache(); farcode.FlushIcache(); - return start; } diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 5e2685c6a9..92b5ffa4c1 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -190,6 +190,8 @@ private: // Do we support cycle counter profiling? 
bool m_supports_cycle_counter; + bool m_enable_blr_optimization; + void EmitResetCycleCounters(); void EmitGetCycles(Arm64Gen::ARM64Reg reg); @@ -219,10 +221,11 @@ private: void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset); - const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC); + void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC); void DoDownCount(); void Cleanup(); + void ResetStack(); // AsmRoutines void GenerateAsm(); @@ -234,10 +237,12 @@ private: void EndTimeProfile(JitBlock* b); // Exits - void WriteExit(u32 destination); - void WriteExit(Arm64Gen::ARM64Reg dest); + void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0); + void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0); void WriteExceptionExit(u32 destination, bool only_external = false); void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false); + void FakeLKExit(u32 exit_address_after_return); + void WriteBLRExit(Arm64Gen::ARM64Reg dest); FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp index 0672488181..7cf4f4aa41 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp @@ -19,21 +19,31 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const if (dest) { - // Are we able to jump directly to the normal entry? - s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2; - if (distance >= -0x40000 && distance <= 0x3FFFF) + if (source.call) { - emit.B(CC_PL, dest->normalEntry); + emit.BL(dest->checkedEntry); } + else + { + // Are we able to jump directly to the normal entry? 
+ s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2; + if (distance >= -0x40000 && distance <= 0x3FFFF) + { + emit.B(CC_PL, dest->normalEntry); + } - // Use the checked entry if either downcount is smaller zero, - // or if we're not able to inline the downcount check here. - emit.B(dest->checkedEntry); + // Use the checked entry if either downcount is smaller zero, + // or if we're not able to inline the downcount check here. + emit.B(dest->checkedEntry); + } } else { emit.MOVI2R(DISPATCHER_PC, source.exitAddress); - emit.B(m_jit.GetAsmRoutines()->dispatcher); + if (source.call) + emit.BL(m_jit.GetAsmRoutines()->dispatcher); + else + emit.B(m_jit.GetAsmRoutines()->dispatcher); } emit.FlushIcache(); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp index 4d1142a184..38e393e08b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp @@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst) if (!js.isLastInstruction) { + if (inst.LK && !js.op->skipLRStack) + { + // We have to fake the stack as the RET instruction was not + // found in the same block. This is a big overhead, but still + // better than calling the dispatcher. + FakeLKExit(js.compilerPC + 4); + } return; } @@ -112,7 +119,7 @@ void JitArm64::bx(UGeckoInstruction inst) return; } - WriteExit(destination); + WriteExit(destination, inst.LK, js.compilerPC + 4); } void JitArm64::bcx(UGeckoInstruction inst) @@ -162,7 +169,7 @@ void JitArm64::bcx(UGeckoInstruction inst) gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); - WriteExit(destination); + WriteExit(destination, inst.LK, js.compilerPC + 4); SwitchToNearCode(); @@ -211,7 +218,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst) LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR])); AND(WA, WA, 30, 29); // Wipe the bottom 2 bits. 
- WriteExit(WA); + + WriteExit(WA, inst.LK_3, js.compilerPC + 4); } void JitArm64::bclrx(UGeckoInstruction inst) @@ -264,7 +272,7 @@ void JitArm64::bclrx(UGeckoInstruction inst) gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); - WriteExit(WA); + WriteBLRExit(WA); if (conditional) SwitchToNearCode(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp index e683c2f0a5..1e2d3944b7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp @@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst) gpr.Flush(FlushMode::FLUSH_ALL); fpr.Flush(FlushMode::FLUSH_ALL); + // Our jit cache also stores some MSR bits. As they have changed, we either + // have to validate them in the BLR/RET check, or just flush the stack here. + ResetStack(); + WriteExceptionExit(js.compilerPC + 4, true); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 9daf386adb..41979be0fa 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -28,6 +28,14 @@ void JitArm64::GenerateAsm() MOVP2R(PPC_REG, &PowerPC::ppcState); + // Store the stack pointer, so we can reset it if the BLR optimization fails. + ADD(X0, SP, 0); + STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); + + // Push {nullptr; -1} as invalid destination on the stack. + MOVI2R(X0, 0xFFFFFFFF); + STP(INDEX_PRE, ZR, X0, SP, -16); + // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance(). // Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B(); @@ -119,6 +127,7 @@ void JitArm64::GenerateAsm() // Call JIT SetJumpTarget(no_block_available); + ResetStack(); MOV(W0, DISPATCHER_PC); MOVP2R(X30, reinterpret_cast(&JitTrampoline)); BLR(X30); @@ -150,6 +159,11 @@ void JitArm64::GenerateAsm() B(dispatcherNoCheck); SetJumpTarget(Exit); + + // Reset the stack pointer, as the BLR optimization has touched it. + LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer)); + ADD(SP, X0, 0); + ABI_PopRegisters(regs_to_save); RET(X30); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 4d281ed9c8..485be781dd 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -58,6 +58,7 @@ struct JitBlock u8* exitPtrs; // to be able to rewrite the exit jump u32 exitAddress; bool linkStatus; // is it already linked? + bool call; }; std::vector linkData; diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index d269c1462a..f507616491 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -116,6 +116,9 @@ struct PowerPCState // also for power management, but we don't care about that. u32 spr[1024]; + // Storage for the stack pointer of the BLR optimization. + u8* stored_stack_pointer; + std::array, NUM_TLBS> tlb; u32 pagetable_base;