Opportunistically predict BLR destinations using RET.

When executing a BL-type instruction, push the new LR onto the stack,
then CALL the dispatcher or linked block rather than JMPing to it.
When executing BLR, compare [rsp+8] to LR ([rsp] itself holds the x86
return address the CALL pushed on top), and RET if it matches, which
it usually will unless the thread was switched out.  If it doesn't
match, reset RSP to avoid overflowing the stack.

This both saves a trip through the dispatcher and improves branch
prediction.

There is a small possibility of stack overflow anyway, which should
be handled... *yawn*
comex 2014-09-07 16:36:25 -04:00
parent 558dee84ca
commit b597ec3e08
7 changed files with 135 additions and 22 deletions
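
Condensed from the hunks below, the emitted pattern looks roughly like
this (a sketch in the JIT's emitter style, not literal code from the
commit; "target" stands for either the linked block's checkedEntry or
asm_routines.dispatcher):

// BL-type exit (WriteExit with bl == true):
MOV(32, R(RSCRATCH2), Imm32(after));   // after = PC of the instruction following the BL
PUSH(RSCRATCH2);                       // remember the guest return address on the host stack
CALL(target);                          // CALL instead of JMP, so the return-stack predictor pairs it with the RET below
// a correctly predicted BLR RETs back to this point:
POP(RSCRATCH);                         // discard the pushed guest PC
JustWriteExit(after, false, 0);        // continue at after as a normal (non-BL) exit

// BLR exit (WriteBLRExit); RSCRATCH holds the target taken from LR:
MOV(32, PPCSTATE(pc), R(RSCRATCH));
CMP(64, R(RSCRATCH), MDisp(RSP, 8));   // [RSP+8] = pushed guest PC; [RSP] = x86 return address
FixupBranch nope = J_CC(CC_NE);
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
RET();                                 // the common, correctly predicted case
SetJumpTarget(nope);
MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
JMP(asm_routines.dispatcherMispredictedBLR, true);  // restores RSP from s_saved_rsp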


@@ -227,31 +227,55 @@ static void ImHere()
been_here[PC] = 1;
}
void Jit64::Cleanup()
bool Jit64::Cleanup()
{
bool did_something = false;
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
{
ABI_PushRegistersAndAdjustStack(0, 0);
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
ABI_PopRegistersAndAdjustStack(0, 0);
did_something = true;
}
// SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time.
if (MMCR0.Hex || MMCR1.Hex)
{
ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst);
did_something = true;
}
void Jit64::WriteExit(u32 destination)
return did_something;
}
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
{
// BLR optimization has similar consequences to block linking.
if (!jo.enableBlocklink)
{
bl = false;
}
Cleanup();
if (bl)
{
MOV(32, R(RSCRATCH2), Imm32(after));
PUSH(RSCRATCH2);
}
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
JustWriteExit(destination, bl, after);
}
void Jit64::JustWriteExit(u32 destination, bool bl, u32 after)
{
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = destination;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
// Link opportunity!
@@ -259,25 +283,79 @@ void Jit64::WriteExit(u32 destination)
if (jo.enableBlocklink && (block = blocks.GetBlockNumberFromStartAddress(destination)) >= 0)
{
// It exists! Joy of joy!
JMP(blocks.GetBlock(block)->checkedEntry, true);
JitBlock* jb = blocks.GetBlock(block);
const u8* addr = jb->checkedEntry;
linkData.exitPtrs = GetWritableCodePtr();
if (bl)
CALL(addr);
else
JMP(addr, true);
linkData.linkStatus = true;
}
else
{
MOV(32, PPCSTATE(pc), Imm32(destination));
linkData.exitPtrs = GetWritableCodePtr();
if (bl)
CALL(asm_routines.dispatcher);
else
JMP(asm_routines.dispatcher, true);
}
b->linkData.push_back(linkData);
if (bl)
{
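// a correctly predicted BLR RETs to this point: pop the guest PC we pushed above and exit to the instruction after the BL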
POP(RSCRATCH);
JustWriteExit(after, false, 0);
}
}
void Jit64::WriteExitDestInRSCRATCH()
void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
{
if (!jo.enableBlocklink)
{
bl = false;
}
if (bl)
{
MOV(32, R(RSCRATCH2), Imm32(after));
PUSH(RSCRATCH2);
}
MOV(32, PPCSTATE(pc), R(RSCRATCH));
Cleanup();
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
if (bl)
{
CALL(asm_routines.dispatcher);
POP(RSCRATCH);
JustWriteExit(after, false, 0);
}
else
{
JMP(asm_routines.dispatcher, true);
}
}
void Jit64::WriteBLRExit()
{
if (!jo.enableBlocklink)
{
WriteExitDestInRSCRATCH();
return;
}
MOV(32, PPCSTATE(pc), R(RSCRATCH));
bool disturbed = Cleanup();
if (disturbed)
MOV(32, R(RSCRATCH), PPCSTATE(pc));
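// [RSP + 8] holds the guest return PC pushed at the BL-type call site; [RSP] is the x86 return address left by the CALL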
CMP(64, R(RSCRATCH), MDisp(RSP, 8));
FixupBranch nope = J_CC(CC_NE);
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
RET();
SetJumpTarget(nope);
MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
JMP(asm_routines.dispatcherMispredictedBLR, true);
}
void Jit64::WriteRfiExitDestInRSCRATCH()
{


@@ -89,13 +89,15 @@ public:
// Utilities for use by opcodes
void WriteExit(u32 destination);
void WriteExitDestInRSCRATCH();
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
void JustWriteExit(u32 destination, bool bl, u32 after);
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
void WriteBLRExit();
void WriteExceptionExit();
void WriteExternalExceptionExit();
void WriteRfiExitDestInRSCRATCH();
void WriteCallInterpreter(UGeckoInstruction _inst);
void Cleanup();
bool Cleanup();
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);


@@ -9,6 +9,9 @@
using namespace Gen;
// Not PowerPC state. Can't put in 'this' because it's out of range...
static void* s_saved_rsp;
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
// At this offset - 4, there is an int specifying the block number.
@@ -16,7 +19,13 @@ using namespace Gen;
void Jit64AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
// We need to own the beginning of RSP, so we do an extra stack adjustment
// for the shadow region before calls in this function. This call will
// waste a bit of space for a second shadow, but whatever.
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, /*frame*/ 16);
// something that can't pass the BLR test
MOV(64, M(&s_saved_rsp), R(RSP));
MOV(64, MDisp(RSP, 8), Imm32((u32)-1));
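// (the sign-extended -1 can never equal a zero-extended 32-bit guest PC, so the first BLR always takes the mispredict path)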
// Two statically allocated registers.
MOV(64, R(RMEM), Imm64((u64)Memory::base));
@@ -24,8 +33,22 @@ void Jit64AsmRoutineManager::Generate()
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));
const u8* outerLoop = GetCodePtr();
ABI_PushRegistersAndAdjustStack(0, 0);
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
ABI_PopRegistersAndAdjustStack(0, 0);
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
dispatcherMispredictedBLR = GetCodePtr();
#if 0 // debug mispredicts
MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
CALL(reinterpret_cast<void *>(&ReportMispredict));
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
#endif
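// mispredicted BLR: restore RSP (throwing away any stale return guesses) and charge the downcount that WriteBLRExit passed in RSCRATCH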
MOV(64, R(RSP), M(&s_saved_rsp));
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
dispatcher = GetCodePtr();
// The result of slice decrementation should be in flags if somebody jumped here
@@ -36,10 +59,13 @@ void Jit64AsmRoutineManager::Generate()
{
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
FixupBranch notStepping = J_CC(CC_Z);
ABI_PushRegistersAndAdjustStack(0, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
ABI_PopRegistersAndAdjustStack(0, 0);
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
FixupBranch noBreakpoint = J_CC(CC_Z);
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
MOV(64, R(RSP), M(&s_saved_rsp));
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
SetJumpTarget(noBreakpoint);
SetJumpTarget(notStepping);
@@ -120,14 +146,17 @@ void Jit64AsmRoutineManager::Generate()
FixupBranch noExtException = J_CC(CC_Z);
MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(32, PPCSTATE(npc), R(RSCRATCH));
ABI_PushRegistersAndAdjustStack(0, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
ABI_PopRegistersAndAdjustStack(0, 0);
SetJumpTarget(noExtException);
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
J_CC(CC_Z, outerLoop);
//Landing pad for drec space
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
MOV(64, R(RSP), M(&s_saved_rsp));
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
GenerateCommon();


@@ -92,7 +92,7 @@ void Jit64::bx(UGeckoInstruction inst)
// make idle loops go faster
js.downcountAmount += 8;
}
WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);
}
// TODO - optimize to hell and beyond
@@ -133,7 +133,7 @@ void Jit64::bcx(UGeckoInstruction inst)
gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);
WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
SetJumpTarget( pConditionDontBranch );
@@ -168,7 +168,7 @@ void Jit64::bcctrx(UGeckoInstruction inst)
if (inst.LK_3)
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4)); // LR = PC + 4;
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
WriteExitDestInRSCRATCH();
WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
}
else
{
@@ -187,7 +187,7 @@ void Jit64::bcctrx(UGeckoInstruction inst)
gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);
WriteExitDestInRSCRATCH();
WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
// Would really like to continue the block here, but it ends. TODO.
SetJumpTarget(b);
@@ -235,7 +235,7 @@ void Jit64::bclrx(UGeckoInstruction inst)
gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);
WriteExitDestInRSCRATCH();
WriteBLRExit();
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
SetJumpTarget( pConditionDontBranch );


@@ -312,7 +312,7 @@ void Jit64::DoMergedBranch()
destination = SignExt16(js.next_inst.BD << 2);
else
destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2);
WriteExit(destination);
WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx
{
@@ -320,7 +320,7 @@ void Jit64::DoMergedBranch()
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
MOV(32, R(RSCRATCH), M(&CTR));
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
WriteExitDestInRSCRATCH();
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
}
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
{
@@ -328,7 +328,7 @@ void Jit64::DoMergedBranch()
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
WriteExitDestInRSCRATCH();
WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4);
}
else
{


@@ -17,6 +17,7 @@ public:
const u8 *enterCode;
const u8 *dispatcherMispredictedBLR;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInRSCRATCH;


@@ -364,6 +364,9 @@ using namespace Gen;
void JitBlockCache::WriteLinkBlock(u8* location, const u8* address)
{
XEmitter emit(location);
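// 0xE8 is the opcode of a near CALL: preserve whether this exit was originally emitted as a CALL (BL-type) or a JMP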
if (*location == 0xE8)
emit.CALL(address);
else
emit.JMP(address, true);
}