PowerPC: More idle loop detections.

This commit is contained in:
degasus 2018-08-09 09:40:12 +02:00
parent 55db7c7a05
commit b8b4b4a383
3 changed files with 48 additions and 20 deletions

View File

@ -285,7 +285,19 @@ void Jit64::bclrx(UGeckoInstruction inst)
RCForkGuard fpr_guard = fpr.Fork(); RCForkGuard fpr_guard = fpr.Fork();
gpr.Flush(); gpr.Flush();
fpr.Flush(); fpr.Flush();
WriteBLRExit();
if (js.op->branchIsIdleLoop)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo));
WriteExceptionExit();
}
else
{
WriteBLRExit();
}
} }
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)

View File

@ -278,7 +278,20 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
WriteBLRExit(WA); if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg XA = EncodeRegTo64(WA);
MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
WriteExceptionExit(js.op->branchTo);
}
else
{
WriteBLRExit(WA);
}
gpr.Unlock(WA); gpr.Unlock(WA);

View File

@ -782,9 +782,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i)); SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i));
code[i].branchIsIdleLoop =
code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i);
bool follow = false; bool follow = false;
bool conditional_continue = false; bool conditional_continue = false;
@ -793,7 +790,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
// If it is small, the performance will be down. // If it is small, the performance will be down.
// If it is big, the size of generated code will be big and // If it is big, the size of generated code will be big and
// cache clearing will happen many times. // cache clearing will happen many times.
if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD) if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW))
{ {
if (inst.OPCD == 18 && block_size > 1) if (inst.OPCD == 18 && block_size > 1)
{ {
@ -816,22 +813,25 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
caller = i; caller = i;
} }
} }
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call && else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call)
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
{ {
// bclrx with unconditional branch = return
// Follow it if we can propagate the LR value of the last CALL instruction.
// Though it would be easy to track the upper level of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers. So we'd need
// to check all store instructions to not alias with the stack.
follow = true;
code[i].branchTo = code[caller].address + 4; code[i].branchTo = code[caller].address + 4;
found_call = false; if ((inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION) &&
code[i].skip = true; numFollows < BRANCH_FOLLOWING_THRESHOLD)
{
// bclrx with unconditional branch = return
// Follow it if we can propagate the LR value of the last CALL instruction.
// Though it would be easy to track the upper level of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers. So we'd need
// to check all store instructions to not alias with the stack.
follow = true;
found_call = false;
code[i].skip = true;
// Skip the RET, so also don't generate the stack entry for the BLR optimization. // Skip the RET, so also don't generate the stack entry for the BLR optimization.
code[caller].skipLRStack = true; code[caller].skipLRStack = true;
}
} }
else if (inst.OPCD == 31 && inst.SUBOP10 == 467) else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
{ {
@ -874,7 +874,10 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
} }
} }
if (follow) code[i].branchIsIdleLoop =
code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i);
if (follow && numFollows < BRANCH_FOLLOWING_THRESHOLD)
{ {
// Follow the unconditional branch. // Follow the unconditional branch.
numFollows++; numFollows++;