diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 395477a18c..34965a6a24 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
   Write32(u32(distance));
 }
 
+FixupBranch XEmitter::CALL()
+{
+  FixupBranch branch;
+  branch.type = 1;
+  branch.ptr = code + 5;
+  Write8(0xE8);
+  Write32(0);
+  return branch;
+}
+
 FixupBranch XEmitter::J(bool force5bytes)
 {
   FixupBranch branch;
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 94bb765203..b294ed1358 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -467,6 +467,7 @@ public:
 #undef CALL
 #endif
   void CALL(const void* fnptr);
+  FixupBranch CALL();
   void CALLptr(OpArg arg);
 
   FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 568bfeca55..2207a60bbe 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -372,6 +372,21 @@ bool Jit64::Cleanup()
   return did_something;
 }
 
+void Jit64::FakeBLCall(u32 after)
+{
+  if (!m_enable_blr_optimization)
+    return;
+
+  // We may need to fake the BLR stack on inlined CALL instructions.
+  // Otherwise we can't return to this location any more.
+  MOV(32, R(RSCRATCH2), Imm32(after));
+  PUSH(RSCRATCH2);
+  FixupBranch skip_exit = CALL();
+  POP(RSCRATCH2);
+  JustWriteExit(after, false, 0);
+  SetJumpTarget(skip_exit);
+}
+
 void Jit64::WriteExit(u32 destination, bool bl, u32 after)
 {
   if (!m_enable_blr_optimization)
@@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
       analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+      analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
     }
     Trace();
   }
@@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+  analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
 }
 
 void Jit64::IntializeSpeculativeConstants()
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index fc7e5d3522..1c48da0650 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -85,6 +85,7 @@ public:
 
   // Utilities for use by opcodes
 
+  void FakeBLCall(u32 after);
   void WriteExit(u32 destination, bool bl = false, u32 after = 0);
   void JustWriteExit(u32 destination, bool bl, u32 after);
   void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
index f33b93f6e3..eb4b5eabd6 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
@@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
   // Because PPCAnalyst::Flatten() merged the blocks.
   if (!js.isLastInstruction)
   {
+    if (inst.LK && !js.op->skipLRStack)
+    {
+      // We have to fake the stack as the RET instruction was not
+      // found in the same block. This adds overhead, but it is still
+      // better than calling the dispatcher.
+      FakeBLCall(js.compilerPC + 4);
+    }
     return;
   }
 
@@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
   if (inst.LK)
     MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
 
+  // If this is not the last instruction of a block and it is an
+  // unconditional branch, we can skip the rest of the processing
+  // because PPCAnalyst::Flatten() merged the blocks.
+  if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
+      (inst.BO & BO_DONT_CHECK_CONDITION))
+  {
+    if (inst.LK && !js.op->skipLRStack)
+    {
+      // We have to fake the stack as the RET instruction was not
+      // found in the same block. This adds overhead, but it is still
+      // better than calling the dispatcher.
+      FakeBLCall(js.compilerPC + 4);
+    }
+    return;
+  }
+
   u32 destination;
   if (inst.AA)
     destination = SignExt16(inst.BD << 2);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 7ccdb00614..d12c542399 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -55,6 +55,7 @@ void JitArm64::Init()
   code_block.m_fpa = &js.fpa;
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
+  analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
 
   m_supports_cycle_counter = HasCycleCounters();
 }
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
index 4ba851d603..4d1142a184 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
@@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITBranchOff);
 
-  gpr.Flush(FlushMode::FLUSH_ALL);
-  fpr.Flush(FlushMode::FLUSH_ALL);
-
   u32 destination;
   if (inst.AA)
     destination = SignExt26(inst.LI << 2);
@@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
     gpr.Unlock(WA);
   }
 
+  if (!js.isLastInstruction)
+  {
+    return;
+  }
+
+  gpr.Flush(FlushMode::FLUSH_ALL);
+  fpr.Flush(FlushMode::FLUSH_ALL);
+
   if (destination == js.compilerPC)
   {
     // make idle loops go faster
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 6b1942559c..0735eda2f7 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -32,8 +32,9 @@ namespace PPCAnalyst
 {
 constexpr int CODEBUFFER_SIZE = 32000;
+
 // 0 does not perform block merging
-constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
+constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
 
 constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
 
@@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
   CodeOp* code = buffer->codebuffer;
 
   bool found_exit = false;
-  u32 return_address = 0;
+  bool found_call = false;
+  size_t caller = 0;
   u32 numFollows = 0;
   u32 num_inst = 0;
 
@@ -686,50 +688,65 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
 
     bool conditional_continue = false;
 
-    // Do we inline leaf functions?
-    if (HasOption(OPTION_LEAF_INLINE))
+    // TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
+    // If it is too small, performance will suffer.
+    // If it is too big, the size of the generated code will grow and
+    // cache clearing will happen many times.
+    if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
     {
       if (inst.OPCD == 18 && blockSize > 1)
       {
-        // Is bx - should we inline? yes!
-        if (inst.AA)
-          destination = SignExt26(inst.LI << 2);
-        else
-          destination = address + SignExt26(inst.LI << 2);
-        if (destination != block->m_address)
-          follow = true;
+        // Always follow BX instructions.
+        // TODO: Loop unrolling might bloat the code size too much.
+        // Enable it carefully.
+        follow = destination != block->m_address;
+        destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
+        if (inst.LK)
+        {
+          found_call = true;
+          caller = i;
+        }
       }
-      else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
-               (inst.BO & (1 << 2)) && return_address != 0)
+      else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
+               (inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
+      {
+        // Always follow unconditional BCX instructions, but they are very rare.
+        follow = true;
+        destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
+        if (inst.LK)
+        {
+          found_call = true;
+          caller = i;
+        }
+      }
+      else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
+               (inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
       {
         // bclrx with unconditional branch = return
+        // Follow it if we can propagate the LR value of the last CALL instruction.
+        // Though it would be easy to track the upper level of call/return,
+        // we can't guarantee the LR value. The PPC ABI forces all functions to push
+        // the LR value on the stack as there are no spare registers. So we'd need
+        // to check that all store instructions do not alias the stack.
         follow = true;
-        destination = return_address;
-        return_address = 0;
+        destination = code[caller].address + 4;
+        found_call = false;
+        code[i].skip = true;
 
-        if (inst.LK)
-          return_address = address + 4;
+        // Skip the RET, so also don't generate the stack entry for the BLR optimization.
+        code[caller].skipLRStack = true;
       }
       else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
       {
-        // mtspr
+        // mtspr, skip CALL/RET merging as LR is overwritten.
         const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
         if (index == SPR_LR)
         {
           // We give up to follow the return address
           // because we have to check the register usage.
-          return_address = 0;
+          found_call = false;
         }
       }
-
-      // TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
-      // If it is small, the performance will be down.
-      // If it is big, the size of generated code will be big and
-      // cache clearning will happen many times.
-      // TODO: Investivate the reason why
-      // "0" is fastest in some games, MP2 for example.
-      if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
-        follow = false;
     }
 
     if (HasOption(OPTION_CONDITIONAL_CONTINUE))
@@ -759,27 +776,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
       }
     }
 
-    if (!follow)
+    if (follow)
     {
+      // Follow the unconditional branch.
+      numFollows++;
+      address = destination;
+    }
+    else
+    {
+      // Just pick the next instruction
       address += 4;
       if (!conditional_continue && opinfo->flags & FL_ENDBLOCK)  // right now we stop early
       {
         found_exit = true;
         break;
       }
+      if (conditional_continue)
+      {
+        // If we skip any conditional branch, we can't guarantee that we get the matching CALL/RET pair.
+        // So we stop inlining the RET here and let the BLR optimization handle this case.
+        found_call = false;
+      }
     }
-// XXX: We don't support inlining yet.
-#if 0
-    else
-    {
-      numFollows++;
-      // We don't "code[i].skip = true" here
-      // because bx may store a certain value to the link register.
-      // Instead, we skip a part of bx in Jit**::bx().
-      address = destination;
-      merged_addresses[size_of_merged_addresses++] = address;
-    }
-#endif
   }
 
   block->m_num_instructions = num_inst;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 02ebc42c18..5a3b86a57a 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -42,6 +42,7 @@ struct CodeOp  // 16B
   bool outputFPRF;
   bool outputCA;
   bool canEndBlock;
+  bool skipLRStack;
   bool skip;  // followed BL-s for example
   // which registers are still needed after this instruction in this block
   BitSet32 fprInUse;
@@ -189,11 +190,11 @@ public:
     // Requires JIT support to be enabled.
     OPTION_CONDITIONAL_CONTINUE = (1 << 0),
 
-    // If there is a unconditional branch that jumps to a leaf function then inline it.
+    // Try to inline unconditional branches/calls/returns.
+    // Also track the LR value to follow unconditional return instructions.
     // Might require JIT intervention to support it correctly.
-    // Requires JITBLock support for inlined code
-    // XXX: NOT COMPLETE
-    OPTION_LEAF_INLINE = (1 << 1),
+    // Especially if the BLR optimization is used.
+    OPTION_BRANCH_FOLLOW = (1 << 1),
 
     // Complex blocks support jumping backwards on to themselves.
     // Happens commonly in loops, pretty complex to support.
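
The PPCAnalyst hunks above compute a branch target before deciding whether to follow it. The standalone sketch below (not Dolphin code: the helper functions, sample addresses, and main() are illustrative stand-ins for Dolphin's SignExt16/SignExt26 and the instruction fields) shows the arithmetic used for bx (LI, 24-bit signed word offset) and unconditional bcx (BD, 14-bit signed word offset), with AA selecting absolute addressing:

#include <cstdint>
#include <cstdio>

// Sign-extension helpers mirroring what the patch relies on.
static int32_t SignExt16(int16_t x)
{
  return static_cast<int32_t>(x);
}

static int32_t SignExt26(uint32_t x)
{
  return (x & 0x2000000) ? static_cast<int32_t>(x | 0xFC000000) : static_cast<int32_t>(x);
}

// bx: destination = SignExt26(LI << 2) + (AA ? 0 : address)
static uint32_t BxTarget(uint32_t address, uint32_t LI, bool AA)
{
  return SignExt26(LI << 2) + (AA ? 0 : address);
}

// bcx: destination = SignExt16(BD << 2) + (AA ? 0 : address)
static uint32_t BcxTarget(uint32_t address, uint32_t BD, bool AA)
{
  return SignExt16(static_cast<int16_t>(BD << 2)) + (AA ? 0 : address);
}

int main()
{
  // A relative bl 0x40 bytes forward of 0x80003100 lands at 0x80003140.
  std::printf("%08x\n", static_cast<unsigned>(BxTarget(0x80003100, 0x10, false)));
  // A relative bc 8 bytes backwards from 0x80003140 lands at 0x80003138.
  std::printf("%08x\n", static_cast<unsigned>(BcxTarget(0x80003140, 0x3FFE, false)));
  return 0;
}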
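
The Jit64::FakeBLCall hunk only pays off together with the BLR optimization's return-prediction stack: an inlined bl whose matching blr is not in the same block still has to push a prediction entry, otherwise a later blr would pop the wrong one. The following self-contained model of that bookkeeping is conceptual only (the ReturnStack type, labels, and addresses are hypothetical; they merely mirror the MOV/PUSH/CALL/POP pattern the patch emits):

#include <cstdint>
#include <cstdio>
#include <stack>
#include <string>
#include <utility>

struct ReturnStack
{
  // first: guest return address (the value LR will hold),
  // second: stand-in for the host-side re-entry point that the real JIT
  // records by pushing a return address with PUSH + CALL.
  std::stack<std::pair<uint32_t, std::string>> entries;

  // Called when a bl is inlined but its blr lives in another block.
  void FakeBLCall(uint32_t after, std::string host_label)
  {
    entries.push({after, std::move(host_label)});
  }

  // Called when a blr executes with the current guest LR value.
  void BLR(uint32_t guest_lr)
  {
    if (!entries.empty() && entries.top().first == guest_lr)
    {
      std::printf("predicted return to %08x via %s\n", static_cast<unsigned>(guest_lr),
                  entries.top().second.c_str());
      entries.pop();
      return;
    }
    std::printf("misprediction for %08x, fall back to the dispatcher\n",
                static_cast<unsigned>(guest_lr));
  }
};

int main()
{
  ReturnStack rs;
  // A block inlines "bl" to a callee whose blr is compiled elsewhere,
  // so a faked entry keeps the prediction stack balanced.
  rs.FakeBLCall(0x80003104, "block_A_resume");
  // Later the callee's block executes its blr with LR = 0x80003104.
  rs.BLR(0x80003104);
  return 0;
}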