Merge pull request #4796 from degasus/blr
JitArm64: Initial implementation of the BLR optimization.
This commit is contained in:
commit
318a387e92
|
@ -46,9 +46,7 @@ void JitArm64::Init()
|
|||
UpdateMemoryOptions();
|
||||
gpr.Init(this);
|
||||
fpr.Init(this);
|
||||
|
||||
blocks.Init();
|
||||
GenerateAsm();
|
||||
|
||||
code_block.m_stats = &js.st;
|
||||
code_block.m_gpa = &js.gpa;
|
||||
|
@ -56,6 +54,9 @@ void JitArm64::Init()
|
|||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||
m_enable_blr_optimization = true;
|
||||
|
||||
GenerateAsm();
|
||||
|
||||
m_supports_cycle_counter = HasCycleCounters();
|
||||
}
|
||||
|
@ -192,8 +193,16 @@ void JitArm64::DoDownCount()
|
|||
gpr.Unlock(WA, WB);
|
||||
}
|
||||
|
||||
// Exits
|
||||
void JitArm64::WriteExit(u32 destination)
|
||||
// Emits code that rewinds the host stack pointer used by the BLR optimization:
// SP is reloaded as ppcState.stored_stack_pointer - 16.
// Emits nothing when m_enable_blr_optimization is false.
void JitArm64::ResetStack()
|
||||
{
|
||||
if (!m_enable_blr_optimization)
|
||||
return;
|
||||
|
||||
// X0 is used as a scratch register for the saved stack pointer value.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
|
||||
// NOTE(review): the 16-byte offset presumably keeps the {nullptr; -1} sentinel
// pair that GenerateAsm pushes right after saving SP - confirm.
SUB(SP, X0, 16);
|
||||
}
|
||||
|
||||
void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
|
||||
{
|
||||
Cleanup();
|
||||
DoDownCount();
|
||||
|
@ -201,31 +210,159 @@ void JitArm64::WriteExit(u32 destination)
|
|||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
// If nobody has taken care of this yet (this can be removed when all branches are done)
|
||||
LK &= m_enable_blr_optimization;
|
||||
|
||||
if (LK)
|
||||
{
|
||||
// Push {ARM_PC+20; PPC_PC} on the stack
|
||||
MOVI2R(X1, exit_address_after_return);
|
||||
ADR(X0, 20);
|
||||
STP(INDEX_PRE, X0, X1, SP, -16);
|
||||
}
|
||||
|
||||
JitBlock* b = js.curBlock;
|
||||
JitBlock::LinkData linkData;
|
||||
linkData.exitAddress = destination;
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
linkData.linkStatus = false;
|
||||
linkData.call = LK;
|
||||
b->linkData.push_back(linkData);
|
||||
|
||||
MOVI2R(DISPATCHER_PC, destination);
|
||||
|
||||
if (!LK)
|
||||
{
|
||||
B(dispatcher);
|
||||
}
|
||||
else
|
||||
{
|
||||
BL(dispatcher);
|
||||
|
||||
// MOVI2R might only require one instruction. So the const offset of 20 bytes
|
||||
// might be wrong. To be safe, just add a NOP here.
|
||||
HINT(HINT_NOP);
|
||||
|
||||
// Write the regular exit node after the return.
|
||||
linkData.exitAddress = exit_address_after_return;
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
linkData.linkStatus = false;
|
||||
linkData.call = false;
|
||||
b->linkData.push_back(linkData);
|
||||
|
||||
MOVI2R(DISPATCHER_PC, exit_address_after_return);
|
||||
B(dispatcher);
|
||||
}
|
||||
}
|
||||
|
||||
void JitArm64::WriteExit(ARM64Reg Reg)
|
||||
void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
|
||||
{
|
||||
Cleanup();
|
||||
DoDownCount();
|
||||
|
||||
if (Reg != DISPATCHER_PC)
|
||||
MOV(DISPATCHER_PC, Reg);
|
||||
gpr.Unlock(Reg);
|
||||
LK &= m_enable_blr_optimization;
|
||||
|
||||
if (dest != DISPATCHER_PC)
|
||||
MOV(DISPATCHER_PC, dest);
|
||||
gpr.Unlock(dest);
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
if (!LK)
|
||||
{
|
||||
B(dispatcher);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Push {ARM_PC, PPC_PC} on the stack
|
||||
MOVI2R(X1, exit_address_after_return);
|
||||
ADR(X0, 12);
|
||||
STP(INDEX_PRE, X0, X1, SP, -16);
|
||||
|
||||
BL(dispatcher);
|
||||
|
||||
// Write the regular exit node after the return.
|
||||
JitBlock* b = js.curBlock;
|
||||
JitBlock::LinkData linkData;
|
||||
linkData.exitAddress = exit_address_after_return;
|
||||
linkData.exitPtrs = GetWritableCodePtr();
|
||||
linkData.linkStatus = false;
|
||||
linkData.call = false;
|
||||
b->linkData.push_back(linkData);
|
||||
|
||||
MOVI2R(DISPATCHER_PC, exit_address_after_return);
|
||||
B(dispatcher);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits a fake call frame for a linking branch that does not end the block
// (bx calls this when !js.isLastInstruction and the LR stack is not skipped).
// The emitted code pushes {host code address, exit_address_after_return} onto SP,
// then branches over an exit stub for exit_address_after_return.
// Emits nothing when m_enable_blr_optimization is false.
//
// exit_address_after_return: PPC address execution should resume at after the
// called code returns (the caller passes js.compilerPC + 4).
void JitArm64::FakeLKExit(u32 exit_address_after_return)
|
||||
{
|
||||
if (!m_enable_blr_optimization)
|
||||
return;
|
||||
|
||||
// We may need to fake the BLR stack on inlined CALL instructions.
|
||||
// Else we can't return to this location any more.
|
||||
ARM64Reg after_reg = gpr.GetReg();
|
||||
ARM64Reg code_reg = gpr.GetReg();
|
||||
MOVI2R(after_reg, exit_address_after_return);
|
||||
// ADR +12 targets the instruction after the BL below, assuming ADR, STP and BL
// are each one 4-byte instruction; that is where the exit stub starts.
ADR(EncodeRegTo64(code_reg), 12);
|
||||
// Pre-indexed store: SP is decremented by 16 and the pair is written there.
STP(INDEX_PRE, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), SP, -16);
|
||||
gpr.Unlock(after_reg, code_reg);
|
||||
|
||||
// Forward branch-with-link over the exit stub; its target is fixed up by
// SetJumpTarget(skip_exit) at the end of this function.
FixupBranch skip_exit = BL();
|
||||
|
||||
// Write the regular exit node after the return.
|
||||
JitBlock* b = js.curBlock;
|
||||
JitBlock::LinkData linkData;
|
||||
linkData.exitAddress = exit_address_after_return;
|
||||
// Record where the stub starts so the block cache can rewrite it when linking.
linkData.exitPtrs = GetWritableCodePtr();
|
||||
linkData.linkStatus = false;
|
||||
linkData.call = false;
|
||||
b->linkData.push_back(linkData);
|
||||
|
||||
MOVI2R(DISPATCHER_PC, exit_address_after_return);
|
||||
B(dispatcher);
|
||||
|
||||
SetJumpTarget(skip_exit);
|
||||
}
|
||||
|
||||
// Emits the block exit used by bclrx for a branch to the PPC address held in dest.
// With the BLR optimization disabled this is just WriteExit(dest).
// Otherwise the emitted code pops a {host code address, PPC address} pair from SP
// and compares the popped PPC address with dest:
//  - match: runs the downcount update and RETs to the popped host code address;
//  - mismatch: runs the downcount update, moves dest into DISPATCHER_PC, resets
//    the stack via ResetStack() and branches to the dispatcher.
//
// dest: register holding the PPC destination address; it is unlocked at the end.
void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
|
||||
{
|
||||
if (!m_enable_blr_optimization)
|
||||
{
|
||||
WriteExit(dest);
|
||||
return;
|
||||
}
|
||||
|
||||
Cleanup();
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
ARM64Reg code = gpr.GetReg();
|
||||
ARM64Reg pc = gpr.GetReg();
|
||||
|
||||
// Check if {ARM_PC, PPC_PC} matches the current state.
|
||||
// Post-indexed load: reads the pair at SP, then advances SP by 16 (pops it).
LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16);
|
||||
CMP(pc, dest);
|
||||
FixupBranch no_match = B(CC_NEQ);
|
||||
|
||||
// Matched path: account for cycles, then return to the recorded host address.
DoDownCount();
|
||||
|
||||
RET(EncodeRegTo64(code));
|
||||
|
||||
SetJumpTarget(no_match);
|
||||
|
||||
// Mismatch path: DoDownCount is emitted a second time because the matched
// path above ends in RET and never reaches this code.
DoDownCount();
|
||||
|
||||
if (dest != DISPATCHER_PC)
|
||||
MOV(DISPATCHER_PC, dest);
|
||||
|
||||
ResetStack();
|
||||
|
||||
B(dispatcher);
|
||||
|
||||
// NOTE(review): presumably register-cache bookkeeping only (no instructions
// emitted), which is why it can follow the final branch - confirm.
gpr.Unlock(dest, pc, code);
|
||||
}
|
||||
|
||||
void JitArm64::WriteExceptionExit(u32 destination, bool only_external)
|
||||
|
@ -399,11 +536,11 @@ void JitArm64::Jit(u32)
|
|||
}
|
||||
|
||||
JitBlock* b = blocks.AllocateBlock(em_address);
|
||||
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC);
|
||||
DoJit(em_address, &code_buffer, b, nextPC);
|
||||
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
|
||||
}
|
||||
|
||||
const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
|
||||
void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
|
||||
{
|
||||
if (em_address == 0)
|
||||
{
|
||||
|
@ -629,5 +766,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
|
|||
|
||||
FlushIcache();
|
||||
farcode.FlushIcache();
|
||||
return start;
|
||||
}
|
||||
|
|
|
@ -190,6 +190,8 @@ private:
|
|||
// Do we support cycle counter profiling?
|
||||
bool m_supports_cycle_counter;
|
||||
|
||||
bool m_enable_blr_optimization;
|
||||
|
||||
void EmitResetCycleCounters();
|
||||
void EmitGetCycles(Arm64Gen::ARM64Reg reg);
|
||||
|
||||
|
@ -219,10 +221,11 @@ private:
|
|||
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
|
||||
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
|
||||
|
||||
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
|
||||
void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
|
||||
|
||||
void DoDownCount();
|
||||
void Cleanup();
|
||||
void ResetStack();
|
||||
|
||||
// AsmRoutines
|
||||
void GenerateAsm();
|
||||
|
@ -234,10 +237,12 @@ private:
|
|||
void EndTimeProfile(JitBlock* b);
|
||||
|
||||
// Exits
|
||||
void WriteExit(u32 destination);
|
||||
void WriteExit(Arm64Gen::ARM64Reg dest);
|
||||
void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0);
|
||||
void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0);
|
||||
void WriteExceptionExit(u32 destination, bool only_external = false);
|
||||
void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false);
|
||||
void FakeLKExit(u32 exit_address_after_return);
|
||||
void WriteBLRExit(Arm64Gen::ARM64Reg dest);
|
||||
|
||||
FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
|
||||
|
||||
|
|
|
@ -18,6 +18,12 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const
|
|||
ARM64XEmitter emit(location);
|
||||
|
||||
if (dest)
|
||||
{
|
||||
if (source.call)
|
||||
{
|
||||
emit.BL(dest->checkedEntry);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Are we able to jump directly to the normal entry?
|
||||
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
|
||||
|
@ -30,9 +36,13 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const
|
|||
// or if we're not able to inline the downcount check here.
|
||||
emit.B(dest->checkedEntry);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
emit.MOVI2R(DISPATCHER_PC, source.exitAddress);
|
||||
if (source.call)
|
||||
emit.BL(m_jit.GetAsmRoutines()->dispatcher);
|
||||
else
|
||||
emit.B(m_jit.GetAsmRoutines()->dispatcher);
|
||||
}
|
||||
emit.FlushIcache();
|
||||
|
|
|
@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||
|
||||
if (!js.isLastInstruction)
|
||||
{
|
||||
if (inst.LK && !js.op->skipLRStack)
|
||||
{
|
||||
// We have to fake the stack as the RET instruction was not
|
||||
// found in the same block. This is a big overhead, but still
|
||||
// better than calling the dispatcher.
|
||||
FakeLKExit(js.compilerPC + 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -112,7 +119,7 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||
return;
|
||||
}
|
||||
|
||||
WriteExit(destination);
|
||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
||||
}
|
||||
|
||||
void JitArm64::bcx(UGeckoInstruction inst)
|
||||
|
@ -162,7 +169,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
|
|||
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
||||
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
||||
|
||||
WriteExit(destination);
|
||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
||||
|
||||
SwitchToNearCode();
|
||||
|
||||
|
@ -211,7 +218,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
|
|||
|
||||
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR]));
|
||||
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.
|
||||
WriteExit(WA);
|
||||
|
||||
WriteExit(WA, inst.LK_3, js.compilerPC + 4);
|
||||
}
|
||||
|
||||
void JitArm64::bclrx(UGeckoInstruction inst)
|
||||
|
@ -264,7 +272,7 @@ void JitArm64::bclrx(UGeckoInstruction inst)
|
|||
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
||||
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
||||
|
||||
WriteExit(WA);
|
||||
WriteBLRExit(WA);
|
||||
|
||||
if (conditional)
|
||||
SwitchToNearCode();
|
||||
|
|
|
@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
|
|||
gpr.Flush(FlushMode::FLUSH_ALL);
|
||||
fpr.Flush(FlushMode::FLUSH_ALL);
|
||||
|
||||
// Our JIT cache also stores some MSR bits. As they have changed, we either
|
||||
// have to validate them in the BLR/RET check, or just flush the stack here.
|
||||
ResetStack();
|
||||
|
||||
WriteExceptionExit(js.compilerPC + 4, true);
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,14 @@ void JitArm64::GenerateAsm()
|
|||
|
||||
MOVP2R(PPC_REG, &PowerPC::ppcState);
|
||||
|
||||
// Store the stack pointer, so we can reset it if the BLR optimization fails.
|
||||
ADD(X0, SP, 0);
|
||||
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
|
||||
|
||||
// Push {nullptr; -1} as invalid destination on the stack.
|
||||
MOVI2R(X0, 0xFFFFFFFF);
|
||||
STP(INDEX_PRE, ZR, X0, SP, -16);
|
||||
|
||||
// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
|
||||
// Advance() does an exception check so we don't know what PC to use until afterwards.
|
||||
FixupBranch to_start_of_timing_slice = B();
|
||||
|
@ -119,6 +127,7 @@ void JitArm64::GenerateAsm()
|
|||
|
||||
// Call JIT
|
||||
SetJumpTarget(no_block_available);
|
||||
ResetStack();
|
||||
MOV(W0, DISPATCHER_PC);
|
||||
MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline));
|
||||
BLR(X30);
|
||||
|
@ -150,6 +159,11 @@ void JitArm64::GenerateAsm()
|
|||
B(dispatcherNoCheck);
|
||||
|
||||
SetJumpTarget(Exit);
|
||||
|
||||
// Reset the stack pointer, as the BLR optimization have touched it.
|
||||
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
|
||||
ADD(SP, X0, 0);
|
||||
|
||||
ABI_PopRegisters(regs_to_save);
|
||||
RET(X30);
|
||||
|
||||
|
|
|
@ -58,6 +58,7 @@ struct JitBlock
|
|||
u8* exitPtrs; // to be able to rewrite the exit jump
|
||||
u32 exitAddress;
|
||||
bool linkStatus; // is it already linked?
|
||||
bool call;
|
||||
};
|
||||
std::vector<LinkData> linkData;
|
||||
|
||||
|
|
|
@ -116,6 +116,9 @@ struct PowerPCState
|
|||
// also for power management, but we don't care about that.
|
||||
u32 spr[1024];
|
||||
|
||||
// Storage for the stack pointer of the BLR optimization.
|
||||
u8* stored_stack_pointer;
|
||||
|
||||
std::array<std::array<tlb_entry, TLB_SIZE / TLB_WAYS>, NUM_TLBS> tlb;
|
||||
|
||||
u32 pagetable_base;
|
||||
|
|
Loading…
Reference in New Issue