From f31b25fe3934b899a00fb049b3c45740c67b11b7 Mon Sep 17 00:00:00 2001
From: degasus
Date: Sun, 22 Jan 2017 19:13:29 +0100
Subject: [PATCH 1/3] Jit64: Enable branch following.

---
 Source/Core/Common/x64Emitter.cpp             | 10 ++
 Source/Core/Common/x64Emitter.h               |  1 +
 Source/Core/Core/PowerPC/Jit64/Jit.cpp        | 17 ++++
 Source/Core/Core/PowerPC/Jit64/Jit.h          |  1 +
 Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 23 +++++
 Source/Core/Core/PowerPC/PPCAnalyst.cpp       | 95 +++++++++++--------
 Source/Core/Core/PowerPC/PPCAnalyst.h         |  8 +-
 7 files changed, 110 insertions(+), 45 deletions(-)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 395477a18c..34965a6a24 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
   Write32(u32(distance));
 }
 
+FixupBranch XEmitter::CALL()
+{
+  FixupBranch branch;
+  branch.type = 1;
+  branch.ptr = code + 5;
+  Write8(0xE8);
+  Write32(0);
+  return branch;
+}
+
 FixupBranch XEmitter::J(bool force5bytes)
 {
   FixupBranch branch;
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 94bb765203..b294ed1358 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -467,6 +467,7 @@ public:
 #undef CALL
 #endif
   void CALL(const void* fnptr);
+  FixupBranch CALL();
   void CALLptr(OpArg arg);
 
   FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 568bfeca55..2207a60bbe 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -372,6 +372,21 @@ bool Jit64::Cleanup()
   return did_something;
 }
 
+void Jit64::FakeBLCall(u32 after)
+{
+  if (!m_enable_blr_optimization)
+    return;
+
+  // We may need to fake the BLR stack on inlined CALL instructions.
+  // Else we can't return to this location any more.
+  MOV(32, R(RSCRATCH2), Imm32(after));
+  PUSH(RSCRATCH2);
+  FixupBranch skip_exit = CALL();
+  POP(RSCRATCH2);
+  JustWriteExit(after, false, 0);
+  SetJumpTarget(skip_exit);
+}
+
 void Jit64::WriteExit(u32 destination, bool bl, u32 after)
 {
   if (!m_enable_blr_optimization)
@@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+      analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
     }
     Trace();
   }
@@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+  analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
 }
 
 void Jit64::IntializeSpeculativeConstants()
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index fc7e5d3522..1c48da0650 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -85,6 +85,7 @@ public:
 
   // Utilities for use by opcodes
 
+  void FakeBLCall(u32 after);
   void WriteExit(u32 destination, bool bl = false, u32 after = 0);
   void JustWriteExit(u32 destination, bool bl, u32 after);
   void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
index f33b93f6e3..eb4b5eabd6 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
@@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
   // Because PPCAnalyst::Flatten() merged the blocks.
   if (!js.isLastInstruction)
   {
+    if (inst.LK && !js.op->skipLRStack)
+    {
+      // We have to fake the stack as the RET instruction was not
+      // found in the same block. This is a big overhead, but still
+      // better than calling the dispatcher.
+      FakeBLCall(js.compilerPC + 4);
+    }
     return;
   }
 
@@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
   if (inst.LK)
     MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
 
+  // If this is not the last instruction of a block and the branch is
+  // unconditional, we can skip the rest of the processing, because
+  // PPCAnalyst::Flatten() merged the blocks.
+  if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
+      (inst.BO & BO_DONT_CHECK_CONDITION))
+  {
+    if (inst.LK && !js.op->skipLRStack)
+    {
+      // We have to fake the stack as the RET instruction was not
+      // found in the same block. This is a big overhead, but still
+      // better than calling the dispatcher.
+      FakeBLCall(js.compilerPC + 4);
+    }
+    return;
+  }
+
   u32 destination;
   if (inst.AA)
     destination = SignExt16(inst.BD << 2);
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 6b1942559c..2a06bd3ae2 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -32,8 +32,9 @@ namespace PPCAnalyst
 {
 constexpr int CODEBUFFER_SIZE = 32000;
+
 // 0 does not perform block merging
-constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
+constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
 
 constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
@@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
   CodeOp* code = buffer->codebuffer;
 
   bool found_exit = false;
-  u32 return_address = 0;
+  bool found_call = false;
+  size_t caller = 0;
   u32 numFollows = 0;
   u32 num_inst = 0;
 
@@ -686,50 +688,60 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
 
     bool conditional_continue = false;
 
-    // Do we inline leaf functions?
-    if (HasOption(OPTION_LEAF_INLINE))
+    // TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
+    //       If it is too small, performance will suffer.
+    //       If it is too big, the generated code will be bigger and the
+    //       JIT cache will be cleared more often.
+    if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
     {
       if (inst.OPCD == 18 && blockSize > 1)
      {
-        // Is bx - should we inline? yes!
-        if (inst.AA)
-          destination = SignExt26(inst.LI << 2);
-        else
-          destination = address + SignExt26(inst.LI << 2);
-        if (destination != block->m_address)
-          follow = true;
+        // Always follow BX instructions.
+        // TODO: Loop unrolling might bloat the code size too much.
+        //       Enable it carefully.
+        destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
+        follow = destination != block->m_address;
+        if (inst.LK)
+        {
+          found_call = true;
+          caller = i;
+        }
       }
-      else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
-               (inst.BO & (1 << 2)) && return_address != 0)
+      else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
+               (inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
+      {
+        // Always follow unconditional BCX instructions, but they are very rare.
+        follow = true;
+        destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
+        if (inst.LK)
+        {
+          found_call = true;
+          caller = i;
+        }
+      }
+      else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
+               (inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
      {
         // bclrx with unconditional branch = return
         follow = true;
-        destination = return_address;
-        return_address = 0;
+        destination = code[caller].address + 4;
+        found_call = false;
         code[i].skip = true;
 
-        if (inst.LK)
-          return_address = address + 4;
+        // Skip the RET, so also don't generate the stack entry for the BLR optimization.
+        code[caller].skipLRStack = true;
      }
       else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
      {
-        // mtspr
+        // mtspr, skip CALL/RET merging as LR is overwritten.
         const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
         if (index == SPR_LR)
         {
           // We give up to follow the return address
           // because we have to check the register usage.
-          return_address = 0;
+          found_call = false;
         }
      }
-
-      // TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
-      //       If it is small, the performance will be down.
-      //       If it is big, the size of generated code will be big and
-      //       cache clearning will happen many times.
-      // TODO: Investivate the reason why
-      //       "0" is fastest in some games, MP2 for example.
-      if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
-        follow = false;
     }
 
     if (HasOption(OPTION_CONDITIONAL_CONTINUE))
@@ -759,27 +771,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
       }
     }
 
-    if (!follow)
+    if (follow)
     {
+      // Follow the unconditional branch.
+      numFollows++;
+      address = destination;
+    }
+    else
+    {
+      // Just pick the next instruction
       address += 4;
       if (!conditional_continue && opinfo->flags & FL_ENDBLOCK)  // right now we stop early
       {
         found_exit = true;
         break;
       }
+      if (conditional_continue)
+      {
+        // If we skip any conditional branch, we can't guarantee to get the matching CALL/RET pair.
+        // So we stop inlining the RET here and let the BLR optimization handle this case.
+        found_call = false;
+      }
     }
-// XXX: We don't support inlining yet.
-#if 0
-    else
-    {
-      numFollows++;
-      // We don't "code[i].skip = true" here
-      // because bx may store a certain value to the link register.
-      // Instead, we skip a part of bx in Jit**::bx().
-      address = destination;
-      merged_addresses[size_of_merged_addresses++] = address;
-    }
-#endif
   }
 
   block->m_num_instructions = num_inst;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 02ebc42c18..1754c79740 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -42,6 +42,7 @@ struct CodeOp  // 16B
   bool outputFPRF;
   bool outputCA;
   bool canEndBlock;
+  bool skipLRStack;
   bool skip;  // followed BL-s for example
   // which registers are still needed after this instruction in this block
   BitSet32 fprInUse;
@@ -189,11 +190,10 @@ public:
     // Requires JIT support to be enabled.
    OPTION_CONDITIONAL_CONTINUE = (1 << 0),
 
-    // If there is a unconditional branch that jumps to a leaf function then inline it.
+    // Try to inline unconditional branches.
     // Might require JIT intervention to support it correctly.
-    // Requires JITBLock support for inlined code
-    // XXX: NOT COMPLETE
-    OPTION_LEAF_INLINE = (1 << 1),
+    // Especially if the BLR optimization is used.
+    OPTION_BRANCH_FOLLOW = (1 << 1),
 
     // Complex blocks support jumping backwards on to themselves.
     // Happens commonly in loops, pretty complex to support.

From 48557643450049310e3b1e71b5b074bce0450390 Mon Sep 17 00:00:00 2001
From: degasus
Date: Sun, 22 Jan 2017 20:40:17 +0100
Subject: [PATCH 2/3] JitArm64: Implement leaf inlining.

There is no BLR stack, so this is quite trivial.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp             |  1 +
 Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 7ccdb00614..d12c542399 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -55,6 +55,7 @@ void JitArm64::Init()
   code_block.m_fpa = &js.fpa;
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+  analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
 
   m_supports_cycle_counter = HasCycleCounters();
 }
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
index 4ba851d603..4d1142a184 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
@@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITBranchOff);
 
-  gpr.Flush(FlushMode::FLUSH_ALL);
-  fpr.Flush(FlushMode::FLUSH_ALL);
-
   u32 destination;
   if (inst.AA)
     destination = SignExt26(inst.LI << 2);
@@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
     gpr.Unlock(WA);
   }
 
+  if (!js.isLastInstruction)
+  {
+    return;
+  }
+
+  gpr.Flush(FlushMode::FLUSH_ALL);
+  fpr.Flush(FlushMode::FLUSH_ALL);
+
   if (destination == js.compilerPC)
   {
     // make idle loops go faster

From ca10cf5afec122bba3b9db030ad25d63e5395008 Mon Sep 17 00:00:00 2001
From: degasus
Date: Sat, 28 Jan 2017 02:59:48 +0100
Subject: [PATCH 3/3] PPCAnalyst: Update comments

---
 Source/Core/Core/PowerPC/PPCAnalyst.cpp | 5 +++++
 Source/Core/Core/PowerPC/PPCAnalyst.h   | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 2a06bd3ae2..0735eda2f7 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -723,6 +723,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
                (inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
      {
         // bclrx with unconditional branch = return
+        // Follow it if we can propagate the LR value of the last CALL instruction.
+        // Though it would be easy to track the upper level of call/return,
+        // we can't guarantee the LR value. The PPC ABI forces all functions to push
+        // the LR value on the stack as there are no spare registers. So we'd need
+        // to check all store instructions to make sure they don't alias with the stack.
         follow = true;
         destination = code[caller].address + 4;
         found_call = false;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 1754c79740..5a3b86a57a 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -190,7 +190,8 @@ public:
     // Requires JIT support to be enabled.
    OPTION_CONDITIONAL_CONTINUE = (1 << 0),
 
-    // Try to inline unconditional branches.
+    // Try to inline unconditional branches/calls/returns.
+    // Also track the LR value to follow unconditional return instructions.
     // Might require JIT intervention to support it correctly.
     // Especially if the BLR optimization is used.
    OPTION_BRANCH_FOLLOW = (1 << 1),
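
Aside (an illustrative sketch, not part of the patch series): the branch following added above hinges on the target arithmetic destination = SignExt26(LI << 2) + (AA ? 0 : address) for bx, and on resolving a followed blr to the caller's address + 4. The self-contained C++ snippet below reproduces that arithmetic for a hypothetical bl at 0x80003000; the SignExt26 helper and the example values are local to this sketch and only assume the standard PowerPC I-form branch encoding (OPCD=18, LI, AA, LK).

#include <cstdint>
#include <cstdio>

// Sign-extend the 26-bit (LI << 2) displacement of a bx instruction.
static int32_t SignExt26(uint32_t x)
{
  return (x & 0x02000000) ? static_cast<int32_t>(x | 0xFC000000) : static_cast<int32_t>(x);
}

int main()
{
  const uint32_t address = 0x80003000;               // hypothetical address of a "bl" instruction
  const uint32_t inst = (18u << 26) | 0x1000u | 1u;  // OPCD=18, (LI << 2)=0x1000, AA=0, LK=1
  const uint32_t li_shifted = inst & 0x03FFFFFC;     // the LI field, already shifted left by 2
  const bool AA = (inst & 2u) != 0;                  // absolute addressing bit
  const bool LK = (inst & 1u) != 0;                  // link bit: this branch is a call

  // Same computation the analyzer uses: absolute target if AA, otherwise PC-relative.
  const uint32_t destination = static_cast<uint32_t>(SignExt26(li_shifted)) + (AA ? 0u : address);

  // With OPTION_BRANCH_FOLLOW the analyzer keeps decoding at 'destination'; because LK (a call)
  // is set, it remembers this op as the caller, and a later unconditional blr is resolved to
  // the caller's address + 4, marking skip/skipLRStack as in the patches above.
  if (LK)
  {
    std::printf("bl target = 0x%08X, return address = 0x%08X\n",
                static_cast<unsigned>(destination), static_cast<unsigned>(address + 4));
  }
  return 0;
}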