Merge pull request #4796 from degasus/blr

JitArm64: Initial implementation of the BLR optimization.
This commit is contained in:
Markus Wick 2017-02-02 21:24:23 +01:00 committed by GitHub
commit 318a387e92
8 changed files with 209 additions and 28 deletions

View File

@ -46,9 +46,7 @@ void JitArm64::Init()
UpdateMemoryOptions(); UpdateMemoryOptions();
gpr.Init(this); gpr.Init(this);
fpr.Init(this); fpr.Init(this);
blocks.Init(); blocks.Init();
GenerateAsm();
code_block.m_stats = &js.st; code_block.m_stats = &js.st;
code_block.m_gpa = &js.gpa; code_block.m_gpa = &js.gpa;
@ -56,6 +54,9 @@ void JitArm64::Init()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
m_enable_blr_optimization = true;
GenerateAsm();
m_supports_cycle_counter = HasCycleCounters(); m_supports_cycle_counter = HasCycleCounters();
} }
@ -192,8 +193,16 @@ void JitArm64::DoDownCount()
gpr.Unlock(WA, WB); gpr.Unlock(WA, WB);
} }
// ResetStack: emits code that reloads the host SP from
// ppcState.stored_stack_pointer, minus 16 bytes. Emits nothing when
// m_enable_blr_optimization is false.
// Exits void JitArm64::ResetStack()
void JitArm64::WriteExit(u32 destination) {
if (!m_enable_blr_optimization)
return;
// X0 is used as scratch for the stored pointer.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
// NOTE(review): the 16-byte offset presumably accounts for the {nullptr; -1}
// pair that GenerateAsm pushes right after storing the pointer - confirm.
SUB(SP, X0, 16);
}
// Emits a block exit to the constant guest address `destination` and records
// it in js.curBlock->linkData so the exit can be rewritten later.
// LK is masked with m_enable_blr_optimization. When it remains set, a
// {host return address; exit_address_after_return} pair is pushed on the host
// stack, the dispatcher is reached with BL instead of B, and a second, regular
// exit to exit_address_after_return is emitted after the call.
void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
{ {
Cleanup(); Cleanup();
DoDownCount(); DoDownCount();
@ -201,31 +210,159 @@ void JitArm64::WriteExit(u32 destination)
if (Profiler::g_ProfileBlocks) if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock); EndTimeProfile(js.curBlock);
// If nobody has taken care of this yet (this can be removed when all branches are done) LK &= m_enable_blr_optimization;
if (LK)
{
// Push {ARM_PC+20; PPC_PC} on the stack
// X0 = host address 20 bytes past the ADR, X1 = guest return address.
// The pre-indexed STP moves SP down by 16 and stores the pair there.
MOVI2R(X1, exit_address_after_return);
ADR(X0, 20);
STP(INDEX_PRE, X0, X1, SP, -16);
}
JitBlock* b = js.curBlock; JitBlock* b = js.curBlock;
JitBlock::LinkData linkData; JitBlock::LinkData linkData;
linkData.exitAddress = destination; linkData.exitAddress = destination;
// The recorded pointer is the start of the MOVI2R + branch sequence below.
linkData.exitPtrs = GetWritableCodePtr(); linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false; linkData.linkStatus = false;
// WriteLinkBlock emits BL instead of B for entries that have `call` set.
linkData.call = LK;
b->linkData.push_back(linkData); b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, destination); MOVI2R(DISPATCHER_PC, destination);
if (!LK)
{
B(dispatcher); B(dispatcher);
}
else
{
BL(dispatcher);
// MOVI2R might only require one instruction. So the const offset of 20 bytes
// might be wrong. Be sure and just add a NOP here.
// With a two-instruction MOVI2R the pushed address (ADR + 20) is this NOP;
// with a one-instruction MOVI2R it is the instruction after the NOP. Either
// way execution continues into the exit written below (assumes 4-byte
// instructions, as on AArch64).
HINT(HINT_NOP);
// Write the regular exit node after the return.
// linkData is reused; the entry pushed earlier was stored as a copy.
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
} }
// Emits a block exit whose guest destination is held in the register `dest`.
// The value is moved into DISPATCHER_PC (unless it already is that register)
// and `dest` is unlocked in the register cache.
// LK is masked with m_enable_blr_optimization. When it remains set, a
// {host return address; exit_address_after_return} pair is pushed on the host
// stack and the dispatcher is called with BL, followed by a regular, linkable
// exit to exit_address_after_return. Otherwise a plain B(dispatcher) is emitted
// and no LinkData entry is recorded.
void JitArm64::WriteExit(ARM64Reg Reg) void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
{ {
Cleanup(); Cleanup();
DoDownCount(); DoDownCount();
if (Reg != DISPATCHER_PC) LK &= m_enable_blr_optimization;
MOV(DISPATCHER_PC, Reg);
gpr.Unlock(Reg); if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);
if (Profiler::g_ProfileBlocks) if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock); EndTimeProfile(js.curBlock);
if (!LK)
{
B(dispatcher); B(dispatcher);
}
else
{
// Push {ARM_PC, PPC_PC} on the stack
// The pushed host address is the ADR's address + 12: ADR, STP and BL are
// one instruction each, so it is the first instruction after the BL, where
// the fallback exit below begins (assumes 4-byte instructions).
// DISPATCHER_PC was already loaded above, so X0/X1 serve as scratch here.
MOVI2R(X1, exit_address_after_return);
ADR(X0, 12);
STP(INDEX_PRE, X0, X1, SP, -16);
BL(dispatcher);
// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = exit_address_after_return;
// Points at the MOVI2R + B sequence emitted right below.
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
}
// Emits the stack push of a call-style exit without leaving the block: pushes
// {host return address; exit_address_after_return} on the host stack, then
// branches (with BL) over an inline exit stub for exit_address_after_return.
// The stub is recorded in js.curBlock->linkData as a regular (non-call) exit.
// Emits nothing when m_enable_blr_optimization is false.
void JitArm64::FakeLKExit(u32 exit_address_after_return)
{
if (!m_enable_blr_optimization)
return;
// We may need to fake the BLR stack on inlined CALL instructions.
// Else we can't return to this location any more.
// Two temporaries from the register cache: one for the guest return address,
// one for the host address of the exit stub.
ARM64Reg after_reg = gpr.GetReg();
ARM64Reg code_reg = gpr.GetReg();
MOVI2R(after_reg, exit_address_after_return);
// ADR + 12 skips ADR, STP and BL, i.e. it is the first instruction of the
// exit stub written below (assumes 4-byte instructions).
ADR(EncodeRegTo64(code_reg), 12);
STP(INDEX_PRE, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), SP, -16);
gpr.Unlock(after_reg, code_reg);
// Forward branch over the exit stub; its target is set at the end.
// NOTE(review): BL also writes the link register (X30). If the register cache
// can hand out W30 for a cached guest value, this would clobber it - confirm.
FixupBranch skip_exit = BL();
// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);
MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
// Normal execution of the current block resumes here.
SetJumpTarget(skip_exit);
}
// Emits the exit used for a guest return (called from bclrx) to the address in
// `dest`. Pops a {host address; guest PC} pair from the host stack; if the
// popped guest PC equals `dest`, returns straight to the popped host address
// with RET. Otherwise the stack is reset and the dispatcher is entered with
// `dest` in DISPATCHER_PC.
// Falls back to WriteExit(dest) when m_enable_blr_optimization is false.
void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
{
if (!m_enable_blr_optimization)
{
WriteExit(dest);
return;
}
Cleanup();
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
ARM64Reg code = gpr.GetReg();
ARM64Reg pc = gpr.GetReg();
// Check if {ARM_PC, PPC_PC} matches the current state.
// Post-indexed LDP: loads the pair at SP, then moves SP up by 16 (a pop).
LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16);
CMP(pc, dest);
// The conditional branch is emitted directly after the compare, before
// DoDownCount().
FixupBranch no_match = B(CC_NEQ);
// Match: account for the cycles, then jump to the popped host address.
DoDownCount();
RET(EncodeRegTo64(code));
// Mismatch: the popped pair does not belong to this return. DoDownCount() is
// emitted again because the copy above is only reached on the matching path.
SetJumpTarget(no_match);
DoDownCount();
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
// Restore SP from the stored pointer before entering the dispatcher.
ResetStack();
B(dispatcher);
// Releases the two temporaries and `dest` in the register cache.
gpr.Unlock(dest, pc, code);
} }
void JitArm64::WriteExceptionExit(u32 destination, bool only_external) void JitArm64::WriteExceptionExit(u32 destination, bool only_external)
@ -399,11 +536,11 @@ void JitArm64::Jit(u32)
} }
JitBlock* b = blocks.AllocateBlock(em_address); JitBlock* b = blocks.AllocateBlock(em_address);
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC); DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses); blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
} }
const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC) void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
{ {
if (em_address == 0) if (em_address == 0)
{ {
@ -629,5 +766,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
FlushIcache(); FlushIcache();
farcode.FlushIcache(); farcode.FlushIcache();
return start;
} }

View File

@ -190,6 +190,8 @@ private:
// Do we support cycle counter profiling? // Do we support cycle counter profiling?
bool m_supports_cycle_counter; bool m_supports_cycle_counter;
bool m_enable_blr_optimization;
void EmitResetCycleCounters(); void EmitResetCycleCounters();
void EmitGetCycles(Arm64Gen::ARM64Reg reg); void EmitGetCycles(Arm64Gen::ARM64Reg reg);
@ -219,10 +221,11 @@ private:
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset); void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC); void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
void DoDownCount(); void DoDownCount();
void Cleanup(); void Cleanup();
void ResetStack();
// AsmRoutines // AsmRoutines
void GenerateAsm(); void GenerateAsm();
@ -234,10 +237,12 @@ private:
void EndTimeProfile(JitBlock* b); void EndTimeProfile(JitBlock* b);
// Exits // Exits
void WriteExit(u32 destination); void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0);
void WriteExit(Arm64Gen::ARM64Reg dest); void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0);
void WriteExceptionExit(u32 destination, bool only_external = false); void WriteExceptionExit(u32 destination, bool only_external = false);
void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false); void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false);
void FakeLKExit(u32 exit_address_after_return);
void WriteBLRExit(Arm64Gen::ARM64Reg dest);
FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set); FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);

View File

@ -18,6 +18,12 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const
ARM64XEmitter emit(location); ARM64XEmitter emit(location);
if (dest) if (dest)
{
if (source.call)
{
emit.BL(dest->checkedEntry);
}
else
{ {
// Are we able to jump directly to the normal entry? // Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2; s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
@ -30,9 +36,13 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const
// or if we're not able to inline the downcount check here. // or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry); emit.B(dest->checkedEntry);
} }
}
else else
{ {
emit.MOVI2R(DISPATCHER_PC, source.exitAddress); emit.MOVI2R(DISPATCHER_PC, source.exitAddress);
if (source.call)
emit.BL(m_jit.GetAsmRoutines()->dispatcher);
else
emit.B(m_jit.GetAsmRoutines()->dispatcher); emit.B(m_jit.GetAsmRoutines()->dispatcher);
} }
emit.FlushIcache(); emit.FlushIcache();

View File

@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst)
if (!js.isLastInstruction) if (!js.isLastInstruction)
{ {
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack as the RET instruction was not
// found in the same block. This is a big overhead, but still
// better than calling the dispatcher.
FakeLKExit(js.compilerPC + 4);
}
return; return;
} }
@ -112,7 +119,7 @@ void JitArm64::bx(UGeckoInstruction inst)
return; return;
} }
WriteExit(destination); WriteExit(destination, inst.LK, js.compilerPC + 4);
} }
void JitArm64::bcx(UGeckoInstruction inst) void JitArm64::bcx(UGeckoInstruction inst)
@ -162,7 +169,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
WriteExit(destination); WriteExit(destination, inst.LK, js.compilerPC + 4);
SwitchToNearCode(); SwitchToNearCode();
@ -211,7 +218,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR])); LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR]));
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits. AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.
WriteExit(WA);
WriteExit(WA, inst.LK_3, js.compilerPC + 4);
} }
void JitArm64::bclrx(UGeckoInstruction inst) void JitArm64::bclrx(UGeckoInstruction inst)
@ -264,7 +272,7 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
WriteExit(WA); WriteBLRExit(WA);
if (conditional) if (conditional)
SwitchToNearCode(); SwitchToNearCode();

View File

@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL); gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL); fpr.Flush(FlushMode::FLUSH_ALL);
// Our JIT cache also stores some MSR bits. As they have changed, we either
// have to validate them in the BLR/RET check, or just flush the stack here.
ResetStack();
WriteExceptionExit(js.compilerPC + 4, true); WriteExceptionExit(js.compilerPC + 4, true);
} }

View File

@ -28,6 +28,14 @@ void JitArm64::GenerateAsm()
MOVP2R(PPC_REG, &PowerPC::ppcState); MOVP2R(PPC_REG, &PowerPC::ppcState);
// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(X0, SP, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
// Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(X0, 0xFFFFFFFF);
STP(INDEX_PRE, ZR, X0, SP, -16);
// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance(). // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards. // Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B(); FixupBranch to_start_of_timing_slice = B();
@ -119,6 +127,7 @@ void JitArm64::GenerateAsm()
// Call JIT // Call JIT
SetJumpTarget(no_block_available); SetJumpTarget(no_block_available);
ResetStack();
MOV(W0, DISPATCHER_PC); MOV(W0, DISPATCHER_PC);
MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline)); MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline));
BLR(X30); BLR(X30);
@ -150,6 +159,11 @@ void JitArm64::GenerateAsm()
B(dispatcherNoCheck); B(dispatcherNoCheck);
SetJumpTarget(Exit); SetJumpTarget(Exit);
// Reset the stack pointer, as the BLR optimization has touched it.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(SP, X0, 0);
ABI_PopRegisters(regs_to_save); ABI_PopRegisters(regs_to_save);
RET(X30); RET(X30);

View File

@ -58,6 +58,7 @@ struct JitBlock
u8* exitPtrs; // to be able to rewrite the exit jump u8* exitPtrs; // to be able to rewrite the exit jump
u32 exitAddress; u32 exitAddress;
bool linkStatus; // is it already linked? bool linkStatus; // is it already linked?
bool call;
}; };
std::vector<LinkData> linkData; std::vector<LinkData> linkData;

View File

@ -116,6 +116,9 @@ struct PowerPCState
// also for power management, but we don't care about that. // also for power management, but we don't care about that.
u32 spr[1024]; u32 spr[1024];
// Storage for the stack pointer of the BLR optimization.
u8* stored_stack_pointer;
std::array<std::array<tlb_entry, TLB_SIZE / TLB_WAYS>, NUM_TLBS> tlb; std::array<std::array<tlb_entry, TLB_SIZE / TLB_WAYS>, NUM_TLBS> tlb;
u32 pagetable_base; u32 pagetable_base;