Opportunistically predict BLR destinations using RET.
When executing a BL-type instruction, push the new LR onto the stack, then CALL the dispatcher or linked block rather than JMPing to it. When executing BLR, compare the value at [rsp+8] to LR, and RET if they match, which they usually will unless the thread was switched out. If they do not match, reset RSP to avoid overflow. This both saves a trip through the dispatcher and improves branch prediction. There is still a small possibility of stack overflow, which should be handled... *yawn*
This commit is contained in:
parent
558dee84ca
commit
b597ec3e08
|
@ -227,31 +227,55 @@ static void ImHere()
|
|||
been_here[PC] = 1;
|
||||
}
|
||||
|
||||
// Emits the end-of-block housekeeping calls that precede a block exit and
// returns true if any call was emitted, false if nothing was emitted.
// NOTE(review): this text is a rendered diff without +/- markers; the `void`
// signature is presumably the removed line and the `bool` signature its
// replacement - confirm against the repository.
void Jit64::Cleanup()
|
||||
bool Jit64::Cleanup()
|
||||
{
|
||||
// Becomes the return value; set whenever a call is emitted below.
bool did_something = false;
|
||||
|
||||
// Only emit the gather-pipe check when that optimization is enabled and this
// block has written FIFO bytes.
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
|
||||
{
|
||||
ABI_PushRegistersAndAdjustStack(0, 0);
|
||||
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
|
||||
ABI_PopRegistersAndAdjustStack(0, 0);
|
||||
did_something = true;
|
||||
}
|
||||
|
||||
// SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time.
|
||||
// When either register is non-zero, emit a call that passes this block's
// downcount amount and load/store and floating-point instruction counts.
if (MMCR0.Hex || MMCR1.Hex)
|
||||
{
|
||||
ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst);
|
||||
did_something = true;
|
||||
}
|
||||
|
||||
return did_something;
|
||||
}
|
||||
|
||||
// Emits the exit sequence for a branch to a compile-time-known `destination`.
//   destination - guest address to exit to.
//   bl          - request the PUSH + CALL form of exit (ignored when block
//                 linking is disabled).
//   after       - value pushed onto the host stack when `bl` is honoured; the
//                 callers visible in this diff pass the address of the
//                 instruction following the branch (compilerPC + 4).
// NOTE(review): this text is a rendered diff without +/- markers; the
// one-argument signature is presumably the removed line and the three-argument
// signature its replacement - confirm against the repository.
void Jit64::WriteExit(u32 destination)
|
||||
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
|
||||
{
|
||||
// BLR optimization has similar consequences to block linking.
|
||||
if (!jo.enableBlocklink)
|
||||
{
|
||||
bl = false;
|
||||
}
|
||||
|
||||
Cleanup();
|
||||
|
||||
// Push `after` as a stack slot (loaded through RSCRATCH2) before the exit
// itself is emitted.
if (bl)
|
||||
{
|
||||
MOV(32, R(RSCRATCH2), Imm32(after));
|
||||
PUSH(RSCRATCH2);
|
||||
}
|
||||
|
||||
// Charge this block's cycle cost against the downcount.
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
|
||||
|
||||
// The jump/call itself and the link bookkeeping are emitted by JustWriteExit.
JustWriteExit(destination, bl, after);
|
||||
}
|
||||
|
||||
void Jit64::JustWriteExit(u32 destination, bool bl, u32 after)
|
||||
{
|
||||
//If nobody has taken care of this yet (this can be removed when all branches are done)
|
||||
JitBlock *b = js.curBlock;
|
||||
JitBlock::LinkData linkData;
|
||||
linkData.exitAddress = destination;
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
linkData.linkStatus = false;
|
||||
|
||||
// Link opportunity!
|
||||
|
@ -259,24 +283,78 @@ void Jit64::WriteExit(u32 destination)
|
|||
if (jo.enableBlocklink && (block = blocks.GetBlockNumberFromStartAddress(destination)) >= 0)
|
||||
{
|
||||
// It exists! Joy of joy!
|
||||
JMP(blocks.GetBlock(block)->checkedEntry, true);
|
||||
JitBlock* jb = blocks.GetBlock(block);
|
||||
const u8* addr = jb->checkedEntry;
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
if (bl)
|
||||
CALL(addr);
|
||||
else
|
||||
JMP(addr, true);
|
||||
linkData.linkStatus = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
MOV(32, PPCSTATE(pc), Imm32(destination));
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
if (bl)
|
||||
CALL(asm_routines.dispatcher);
|
||||
else
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
}
|
||||
|
||||
b->linkData.push_back(linkData);
|
||||
|
||||
if (bl)
|
||||
{
|
||||
POP(RSCRATCH);
|
||||
JustWriteExit(after, false, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits the exit sequence for a branch whose destination is only known at run
// time and is held in RSCRATCH.
//   bl    - request the PUSH + CALL form of exit (ignored when block linking
//           is disabled).
//   after - value pushed onto the host stack when `bl` is honoured, and the
//           address exited to once the CALL has returned.
// NOTE(review): this text is a rendered diff without +/- markers; the
// zero-argument signature and the unconditional JMP to the dispatcher that
// directly precedes `if (bl)` are presumably removed lines - confirm against
// the repository.
void Jit64::WriteExitDestInRSCRATCH()
|
||||
void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
|
||||
{
|
||||
if (!jo.enableBlocklink)
|
||||
{
|
||||
bl = false;
|
||||
}
|
||||
// Push `after` as a stack slot (loaded through RSCRATCH2) ahead of the CALL
// emitted further down.
if (bl)
|
||||
{
|
||||
MOV(32, R(RSCRATCH2), Imm32(after));
|
||||
PUSH(RSCRATCH2);
|
||||
}
|
||||
// Store the run-time destination into the guest pc before Cleanup() runs.
MOV(32, PPCSTATE(pc), R(RSCRATCH));
|
||||
Cleanup();
|
||||
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
if (bl)
|
||||
{
|
||||
// CALL rather than JMP; the code after the CALL drops the pushed slot and
// then emits an ordinary (non-bl) exit to `after`.
CALL(asm_routines.dispatcher);
|
||||
POP(RSCRATCH);
|
||||
JustWriteExit(after, false, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits the exit sequence for a branch-to-LR whose destination is in RSCRATCH.
// With block linking enabled, the emitted code compares the destination with
// the stack slot at [RSP+8] and RETs when they are equal; otherwise it jumps
// to the mispredict entry of the dispatcher. With block linking disabled it
// emits the plain WriteExitDestInRSCRATCH() exit instead.
void Jit64::WriteBLRExit()
|
||||
{
|
||||
if (!jo.enableBlocklink)
|
||||
{
|
||||
WriteExitDestInRSCRATCH();
|
||||
return;
|
||||
}
|
||||
// Save the destination into the guest pc before Cleanup() runs.
MOV(32, PPCSTATE(pc), R(RSCRATCH));
|
||||
bool disturbed = Cleanup();
|
||||
// If Cleanup() emitted any calls, reload the destination from pc -
// presumably because those calls may clobber RSCRATCH.
if (disturbed)
|
||||
MOV(32, R(RSCRATCH), PPCSTATE(pc));
|
||||
// The BL-style exits in this diff PUSH their `after` value and then CALL, so
// that value sits one slot above the host return address, i.e. at [RSP+8].
CMP(64, R(RSCRATCH), MDisp(RSP, 8));
|
||||
FixupBranch nope = J_CC(CC_NE);
|
||||
// Match: charge the downcount and return to the code that follows the CALL.
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
|
||||
RET();
|
||||
SetJumpTarget(nope);
|
||||
// Mismatch: hand the downcount amount over in RSCRATCH; the code emitted at
// dispatcherMispredictedBLR restores RSP from the saved value and subtracts
// RSCRATCH from the downcount.
MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
|
||||
JMP(asm_routines.dispatcherMispredictedBLR, true);
|
||||
}
|
||||
|
||||
void Jit64::WriteRfiExitDestInRSCRATCH()
|
||||
|
|
|
@ -89,13 +89,15 @@ public:
|
|||
|
||||
// Utilities for use by opcodes
|
||||
|
||||
void WriteExit(u32 destination);
|
||||
void WriteExitDestInRSCRATCH();
|
||||
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
|
||||
void JustWriteExit(u32 destination, bool bl, u32 after);
|
||||
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
|
||||
void WriteBLRExit();
|
||||
void WriteExceptionExit();
|
||||
void WriteExternalExceptionExit();
|
||||
void WriteRfiExitDestInRSCRATCH();
|
||||
void WriteCallInterpreter(UGeckoInstruction _inst);
|
||||
void Cleanup();
|
||||
bool Cleanup();
|
||||
|
||||
void GenerateConstantOverflow(bool overflow);
|
||||
void GenerateConstantOverflow(s64 val);
|
||||
|
|
|
@ -9,6 +9,9 @@
|
|||
|
||||
using namespace Gen;
|
||||
|
||||
// Not PowerPC state. Can't put in 'this' because it's out of range...
|
||||
static void* s_saved_rsp;
|
||||
|
||||
// PLAN: no more block numbers - crazy opcodes just contain offset within
|
||||
// dynarec buffer
|
||||
// At this offset - 4, there is an int specifying the block number.
|
||||
|
@ -16,7 +19,13 @@ using namespace Gen;
|
|||
void Jit64AsmRoutineManager::Generate()
|
||||
{
|
||||
enterCode = AlignCode16();
|
||||
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
|
||||
// We need to own the beginning of RSP, so we do an extra stack adjustment
|
||||
// for the shadow region before calls in this function. This call will
|
||||
// waste a bit of space for a second shadow, but whatever.
|
||||
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, /*frame*/ 16);
|
||||
// something that can't pass the BLR test
|
||||
MOV(64, M(&s_saved_rsp), R(RSP));
|
||||
MOV(64, MDisp(RSP, 8), Imm32((u32)-1));
|
||||
|
||||
// Two statically allocated registers.
|
||||
MOV(64, R(RMEM), Imm64((u64)Memory::base));
|
||||
|
@ -24,8 +33,22 @@ void Jit64AsmRoutineManager::Generate()
|
|||
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));
|
||||
|
||||
const u8* outerLoop = GetCodePtr();
|
||||
ABI_PushRegistersAndAdjustStack(0, 0);
|
||||
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
|
||||
ABI_PopRegistersAndAdjustStack(0, 0);
|
||||
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
|
||||
dispatcherMispredictedBLR = GetCodePtr();
|
||||
|
||||
#if 0 // debug mispredicts
|
||||
MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
|
||||
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
|
||||
CALL(reinterpret_cast<void *>(&ReportMispredict));
|
||||
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
|
||||
#endif
|
||||
|
||||
MOV(64, R(RSP), M(&s_saved_rsp));
|
||||
|
||||
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
|
||||
|
||||
dispatcher = GetCodePtr();
|
||||
// The result of slice decrementation should be in flags if somebody jumped here
|
||||
|
@ -36,10 +59,13 @@ void Jit64AsmRoutineManager::Generate()
|
|||
{
|
||||
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
|
||||
FixupBranch notStepping = J_CC(CC_Z);
|
||||
ABI_PushRegistersAndAdjustStack(0, 0);
|
||||
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
|
||||
ABI_PopRegistersAndAdjustStack(0, 0);
|
||||
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
|
||||
FixupBranch noBreakpoint = J_CC(CC_Z);
|
||||
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
|
||||
MOV(64, R(RSP), M(&s_saved_rsp));
|
||||
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||
RET();
|
||||
SetJumpTarget(noBreakpoint);
|
||||
SetJumpTarget(notStepping);
|
||||
|
@ -120,14 +146,17 @@ void Jit64AsmRoutineManager::Generate()
|
|||
FixupBranch noExtException = J_CC(CC_Z);
|
||||
MOV(32, R(RSCRATCH), PPCSTATE(pc));
|
||||
MOV(32, PPCSTATE(npc), R(RSCRATCH));
|
||||
ABI_PushRegistersAndAdjustStack(0, 0);
|
||||
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
|
||||
ABI_PopRegistersAndAdjustStack(0, 0);
|
||||
SetJumpTarget(noExtException);
|
||||
|
||||
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
|
||||
J_CC(CC_Z, outerLoop);
|
||||
|
||||
//Landing pad for drec space
|
||||
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
|
||||
MOV(64, R(RSP), M(&s_saved_rsp));
|
||||
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||
RET();
|
||||
|
||||
GenerateCommon();
|
||||
|
|
|
@ -92,7 +92,7 @@ void Jit64::bx(UGeckoInstruction inst)
|
|||
// make idle loops go faster
|
||||
js.downcountAmount += 8;
|
||||
}
|
||||
WriteExit(destination);
|
||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
||||
}
|
||||
|
||||
// TODO - optimize to hell and beyond
|
||||
|
@ -133,7 +133,7 @@ void Jit64::bcx(UGeckoInstruction inst)
|
|||
|
||||
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
WriteExit(destination);
|
||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
||||
|
||||
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
||||
SetJumpTarget( pConditionDontBranch );
|
||||
|
@ -168,7 +168,7 @@ void Jit64::bcctrx(UGeckoInstruction inst)
|
|||
if (inst.LK_3)
|
||||
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4)); // LR = PC + 4;
|
||||
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
||||
WriteExitDestInRSCRATCH();
|
||||
WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -187,7 +187,7 @@ void Jit64::bcctrx(UGeckoInstruction inst)
|
|||
|
||||
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
WriteExitDestInRSCRATCH();
|
||||
WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
|
||||
// Would really like to continue the block here, but it ends. TODO.
|
||||
SetJumpTarget(b);
|
||||
|
||||
|
@ -235,7 +235,7 @@ void Jit64::bclrx(UGeckoInstruction inst)
|
|||
|
||||
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||
WriteExitDestInRSCRATCH();
|
||||
WriteBLRExit();
|
||||
|
||||
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
||||
SetJumpTarget( pConditionDontBranch );
|
||||
|
|
|
@ -312,7 +312,7 @@ void Jit64::DoMergedBranch()
|
|||
destination = SignExt16(js.next_inst.BD << 2);
|
||||
else
|
||||
destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2);
|
||||
WriteExit(destination);
|
||||
WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4);
|
||||
}
|
||||
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx
|
||||
{
|
||||
|
@ -320,7 +320,7 @@ void Jit64::DoMergedBranch()
|
|||
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
|
||||
MOV(32, R(RSCRATCH), M(&CTR));
|
||||
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
||||
WriteExitDestInRSCRATCH();
|
||||
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
|
||||
}
|
||||
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
|
||||
{
|
||||
|
@ -328,7 +328,7 @@ void Jit64::DoMergedBranch()
|
|||
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
||||
if (js.next_inst.LK)
|
||||
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
|
||||
WriteExitDestInRSCRATCH();
|
||||
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -17,6 +17,7 @@ public:
|
|||
|
||||
const u8 *enterCode;
|
||||
|
||||
const u8 *dispatcherMispredictedBLR;
|
||||
const u8 *dispatcher;
|
||||
const u8 *dispatcherNoCheck;
|
||||
const u8 *dispatcherPcInRSCRATCH;
|
||||
|
|
|
@ -364,7 +364,10 @@ using namespace Gen;
|
|||
// Overwrites the branch instruction at `location` so that it targets
// `address`, keeping the kind of branch (CALL or JMP) that is already there.
//   location - start of the existing exit instruction to patch.
//   address  - new branch target.
// NOTE(review): this text is a rendered diff without +/- markers; the
// unconditional emit.JMP directly below the emitter construction is presumably
// the removed line - confirm against the repository.
void JitBlockCache::WriteLinkBlock(u8* location, const u8* address)
|
||||
{
|
||||
XEmitter emit(location);
|
||||
emit.JMP(address, true);
|
||||
// 0xE8 is the x86 opcode byte for CALL rel32: an exit that was written as a
// CALL is re-emitted as a CALL, anything else as a JMP.
if (*location == 0xE8)
|
||||
emit.CALL(address);
|
||||
else
|
||||
emit.JMP(address, true);
|
||||
}
|
||||
|
||||
void JitBlockCache::WriteDestroyBlock(const u8* location, u32 address)
|
||||
|
|
Loading…
Reference in New Issue