Merge pull request #4735 from degasus/jitcache
Jit64: Enable branch following.
This commit is contained in:
commit
5da565a1a1
|
@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
|
|||
Write32(u32(distance));
|
||||
}
|
||||
|
||||
// Emits a CALL instruction whose target is not known yet.
// Writes the opcode byte 0xE8 (x86 near CALL with a 32-bit relative
// displacement) followed by a zero 32-bit placeholder, and returns a
// FixupBranch describing that placeholder so the caller can resolve it later
// (Jit64::FakeBLCall hands the result to SetJumpTarget).
FixupBranch XEmitter::CALL()
|
||||
{
|
||||
FixupBranch branch;
|
||||
// NOTE(review): 1 presumably marks a 32-bit displacement fixup - confirm against FixupBranch.
branch.type = 1;
|
||||
// Captured before anything is written: 5 = 1 opcode byte + 4 displacement bytes.
// Assuming `code` is the current write pointer, this is the address just past
// the instruction emitted below.
branch.ptr = code + 5;
|
||||
// Opcode byte of the CALL instruction.
Write8(0xE8);
|
||||
// Placeholder displacement, to be overwritten when the fixup is resolved.
Write32(0);
|
||||
return branch;
|
||||
}
|
||||
|
||||
FixupBranch XEmitter::J(bool force5bytes)
|
||||
{
|
||||
FixupBranch branch;
|
||||
|
|
|
@ -467,6 +467,7 @@ public:
|
|||
#undef CALL
|
||||
#endif
|
||||
void CALL(const void* fnptr);
|
||||
FixupBranch CALL();
|
||||
void CALLptr(OpArg arg);
|
||||
|
||||
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
|
||||
|
|
|
@ -372,6 +372,21 @@ bool Jit64::Cleanup()
|
|||
return did_something;
|
||||
}
|
||||
|
||||
// Emits a "fake" call sequence for a branch-with-link that was followed
// (inlined) by the analyzer instead of ending the block.
// `after` is the emulated address that the link register points to (callers
// in bx/bcx pass js.compilerPC + 4).
// Emits nothing when the BLR optimization is disabled.
void Jit64::FakeBLCall(u32 after)
|
||||
{
|
||||
if (!m_enable_blr_optimization)
|
||||
return;
|
||||
|
||||
// We may need to fake the BLR stack on inlined CALL instructions.
|
||||
// Else we can't return to this location any more.
|
||||
// Push the emulated return address onto the host stack.
MOV(32, R(RSCRATCH2), Imm32(after));
|
||||
PUSH(RSCRATCH2);
|
||||
// CALL with an unresolved target; it is bound below to the code that follows
// the exit stub, so normal execution jumps over the POP + exit.
FixupBranch skip_exit = CALL();
|
||||
// Landing pad at the CALL's host return address.
// NOTE(review): presumably reached when the emulated BLR later executes a
// host RET - confirm against the BLR optimization code. It drops the pushed
// address and exits to `after`.
POP(RSCRATCH2);
|
||||
JustWriteExit(after, false, 0);
|
||||
// Resolve the CALL target: execution of the inlined callee continues here.
SetJumpTarget(skip_exit);
|
||||
}
|
||||
|
||||
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
|
||||
{
|
||||
if (!m_enable_blr_optimization)
|
||||
|
@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
|
|||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||
}
|
||||
Trace();
|
||||
}
|
||||
|
@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
|
|||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||
}
|
||||
|
||||
void Jit64::IntializeSpeculativeConstants()
|
||||
|
|
|
@ -85,6 +85,7 @@ public:
|
|||
|
||||
// Utilities for use by opcodes
|
||||
|
||||
void FakeBLCall(u32 after);
|
||||
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
|
||||
void JustWriteExit(u32 destination, bool bl, u32 after);
|
||||
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
|
||||
|
|
|
@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
|
|||
// Because PPCAnalyst::Flatten() merged the blocks.
|
||||
if (!js.isLastInstruction)
|
||||
{
|
||||
if (inst.LK && !js.op->skipLRStack)
|
||||
{
|
||||
// We have to fake the stack as the RET instruction was not
|
||||
// found in the same block. This is a big overhead, but still
|
||||
// better than calling the dispatcher.
|
||||
FakeBLCall(js.compilerPC + 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
|
|||
if (inst.LK)
|
||||
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
|
||||
|
||||
// If this is not the last instruction of a block
|
||||
// and an unconditional branch, we will skip the rest process.
|
||||
// Because PPCAnalyst::Flatten() merged the blocks.
|
||||
if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
|
||||
(inst.BO & BO_DONT_CHECK_CONDITION))
|
||||
{
|
||||
if (inst.LK && !js.op->skipLRStack)
|
||||
{
|
||||
// We have to fake the stack as the RET instruction was not
|
||||
// found in the same block. This is a big overhead, but still
|
||||
// better than calling the dispatcher.
|
||||
FakeBLCall(js.compilerPC + 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
u32 destination;
|
||||
if (inst.AA)
|
||||
destination = SignExt16(inst.BD << 2);
|
||||
|
|
|
@ -55,6 +55,7 @@ void JitArm64::Init()
|
|||
code_block.m_fpa = &js.fpa;
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||
|
||||
m_supports_cycle_counter = HasCycleCounters();
|
||||
}
|
||||
|
|
|
@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(bJITBranchOff);
|
||||
|
||||
gpr.Flush(FlushMode::FLUSH_ALL);
|
||||
fpr.Flush(FlushMode::FLUSH_ALL);
|
||||
|
||||
u32 destination;
|
||||
if (inst.AA)
|
||||
destination = SignExt26(inst.LI << 2);
|
||||
|
@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||
gpr.Unlock(WA);
|
||||
}
|
||||
|
||||
if (!js.isLastInstruction)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
gpr.Flush(FlushMode::FLUSH_ALL);
|
||||
fpr.Flush(FlushMode::FLUSH_ALL);
|
||||
|
||||
if (destination == js.compilerPC)
|
||||
{
|
||||
// make idle loops go faster
|
||||
|
|
|
@ -32,8 +32,9 @@
|
|||
namespace PPCAnalyst
|
||||
{
|
||||
constexpr int CODEBUFFER_SIZE = 32000;
|
||||
|
||||
// 0 does not perform block merging
|
||||
constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
|
||||
constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
|
||||
|
||||
constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
|
||||
|
||||
|
@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||
CodeOp* code = buffer->codebuffer;
|
||||
|
||||
bool found_exit = false;
|
||||
u32 return_address = 0;
|
||||
bool found_call = false;
|
||||
size_t caller = 0;
|
||||
u32 numFollows = 0;
|
||||
u32 num_inst = 0;
|
||||
|
||||
|
@ -686,50 +688,65 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||
|
||||
bool conditional_continue = false;
|
||||
|
||||
// Do we inline leaf functions?
|
||||
if (HasOption(OPTION_LEAF_INLINE))
|
||||
// TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
|
||||
// If it is small, the performance will be down.
|
||||
// If it is big, the size of generated code will be big and
|
||||
// cache clearing will happen many times.
|
||||
if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
|
||||
{
|
||||
if (inst.OPCD == 18 && blockSize > 1)
|
||||
{
|
||||
// Is bx - should we inline? yes!
|
||||
if (inst.AA)
|
||||
destination = SignExt26(inst.LI << 2);
|
||||
else
|
||||
destination = address + SignExt26(inst.LI << 2);
|
||||
if (destination != block->m_address)
|
||||
follow = true;
|
||||
// Always follow BX instructions.
|
||||
// TODO: Loop unrolling might bloat the code size too much.
|
||||
// Enable it carefully.
|
||||
follow = destination != block->m_address;
|
||||
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
|
||||
if (inst.LK)
|
||||
{
|
||||
found_call = true;
|
||||
caller = i;
|
||||
}
|
||||
}
|
||||
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
|
||||
(inst.BO & (1 << 2)) && return_address != 0)
|
||||
else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
|
||||
(inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
|
||||
{
|
||||
// Always follow unconditional BCX instructions, but they are very rare.
|
||||
follow = true;
|
||||
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
|
||||
if (inst.LK)
|
||||
{
|
||||
found_call = true;
|
||||
caller = i;
|
||||
}
|
||||
}
|
||||
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
|
||||
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
|
||||
{
|
||||
// bclrx with unconditional branch = return
|
||||
// Follow it if we can propagate the LR value of the last CALL instruction.
|
||||
// Though it would be easy to track the upper level of call/return,
|
||||
// we can't guarantee the LR value. The PPC ABI forces all functions to push
|
||||
// the LR value on the stack as there are no spare registers. So we'd need
|
||||
// to check all store instructions to not alias with the stack.
|
||||
follow = true;
|
||||
destination = return_address;
|
||||
return_address = 0;
|
||||
destination = code[caller].address + 4;
|
||||
found_call = false;
|
||||
code[i].skip = true;
|
||||
|
||||
if (inst.LK)
|
||||
return_address = address + 4;
|
||||
// Skip the RET, so also don't generate the stack entry for the BLR optimization.
|
||||
code[caller].skipLRStack = true;
|
||||
}
|
||||
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
|
||||
{
|
||||
// mtspr
|
||||
// mtspr, skip CALL/RET merging as LR is overwritten.
|
||||
const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
|
||||
if (index == SPR_LR)
|
||||
{
|
||||
// We give up to follow the return address
|
||||
// because we have to check the register usage.
|
||||
return_address = 0;
|
||||
found_call = false;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
|
||||
// If it is small, the performance will be down.
|
||||
// If it is big, the size of generated code will be big and
|
||||
// cache clearing will happen many times.
|
||||
// TODO: Investigate the reason why
|
||||
// "0" is fastest in some games, MP2 for example.
|
||||
if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
|
||||
follow = false;
|
||||
}
|
||||
|
||||
if (HasOption(OPTION_CONDITIONAL_CONTINUE))
|
||||
|
@ -759,27 +776,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||
}
|
||||
}
|
||||
|
||||
if (!follow)
|
||||
if (follow)
|
||||
{
|
||||
// Follow the unconditional branch.
|
||||
numFollows++;
|
||||
address = destination;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Just pick the next instruction
|
||||
address += 4;
|
||||
if (!conditional_continue && opinfo->flags & FL_ENDBLOCK) // right now we stop early
|
||||
{
|
||||
found_exit = true;
|
||||
break;
|
||||
}
|
||||
if (conditional_continue)
|
||||
{
|
||||
// If we skip any conditional branch, we can't guarantee to get the matching CALL/RET pair.
|
||||
// So we stop inlining the RET here and let the BLR optimization handle this case.
|
||||
found_call = false;
|
||||
}
|
||||
}
|
||||
// XXX: We don't support inlining yet.
|
||||
#if 0
|
||||
else
|
||||
{
|
||||
numFollows++;
|
||||
// We don't "code[i].skip = true" here
|
||||
// because bx may store a certain value to the link register.
|
||||
// Instead, we skip a part of bx in Jit**::bx().
|
||||
address = destination;
|
||||
merged_addresses[size_of_merged_addresses++] = address;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
block->m_num_instructions = num_inst;
|
||||
|
|
|
@ -42,6 +42,7 @@ struct CodeOp // 16B
|
|||
bool outputFPRF;
|
||||
bool outputCA;
|
||||
bool canEndBlock;
|
||||
bool skipLRStack;
|
||||
bool skip; // followed BL-s for example
|
||||
// which registers are still needed after this instruction in this block
|
||||
BitSet32 fprInUse;
|
||||
|
@ -189,11 +190,11 @@ public:
|
|||
// Requires JIT support to be enabled.
|
||||
OPTION_CONDITIONAL_CONTINUE = (1 << 0),
|
||||
|
||||
// If there is an unconditional branch that jumps to a leaf function then inline it.
|
||||
// Try to inline unconditional branches/calls/returns.
|
||||
// Also track the LR value to follow unconditional return instructions.
|
||||
// Might require JIT intervention to support it correctly.
|
||||
// Requires JITBLock support for inlined code
|
||||
// XXX: NOT COMPLETE
|
||||
OPTION_LEAF_INLINE = (1 << 1),
|
||||
// Especially if the BLR optimization is used.
|
||||
OPTION_BRANCH_FOLLOW = (1 << 1),
|
||||
|
||||
// Complex blocks support jumping backwards on to themselves.
|
||||
// Happens commonly in loops, pretty complex to support.
|
||||
|
|
Loading…
Reference in New Issue