From 8ab34b1a3e51342fe40b9f98907bc10a2786b28d Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 12 Aug 2017 21:09:27 +0200 Subject: [PATCH 1/7] Jit64: Inline the profiler calls. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 25 ++++++++++++++------- Source/Core/Core/PowerPC/Profiler.cpp | 1 + Source/Core/Core/PowerPC/Profiler.h | 31 -------------------------- 3 files changed, 18 insertions(+), 39 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 9f45d150f6..25b391aac7 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -16,6 +16,7 @@ #include "Common/File.h" #include "Common/Logging/Log.h" #include "Common/MemoryUtil.h" +#include "Common/PerformanceCounter.h" #include "Common/StringUtil.h" #include "Common/x64ABI.h" #include "Core/Core.h" @@ -648,15 +649,17 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc } // Conditionally add profiling code. + b->ticCounter = 0; + b->ticStart = 0; + b->ticStop = 0; if (Profiler::g_ProfileBlocks) { MOV(64, R(RSCRATCH), ImmPtr(&b->runCount)); ADD(32, MatR(RSCRATCH), Imm8(1)); - b->ticCounter = 0; - b->ticStart = 0; - b->ticStop = 0; + // get start tic - PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart); + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->ticStart))); + ABI_CallFunction(QueryPerformanceCounter); } #if defined(_DEBUG) || defined(DEBUGFAST) || defined(NAN_CHECK) // should help logged stack-traces become more accurate @@ -734,12 +737,18 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc if (Profiler::g_ProfileBlocks) { // WARNING - cmp->branch merging will screw this up. - PROFILER_VPUSH; + BitSet32 registersInUse = CallerSavedRegistersInUse(); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); // get end tic - PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop); + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->ticStop))); + ABI_CallFunction(QueryPerformanceCounter); // tic counter += (end tic - start tic) - PROFILER_UPDATE_TIME(b); - PROFILER_VPOP; + MOV(64, R(RSCRATCH2), Imm64((u64)b)); + MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStop))); + SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStart))); + ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter))); + MOV(64, MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)), R(RSCRATCH)); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); } js.isLastInstruction = true; } diff --git a/Source/Core/Core/PowerPC/Profiler.cpp b/Source/Core/Core/PowerPC/Profiler.cpp index 25a31cf740..0e1a176727 100644 --- a/Source/Core/Core/PowerPC/Profiler.cpp +++ b/Source/Core/Core/PowerPC/Profiler.cpp @@ -5,6 +5,7 @@ #include "Core/PowerPC/Profiler.h" #include +#include "Common/PerformanceCounter.h" #include "Core/PowerPC/JitInterface.h" namespace Profiler diff --git a/Source/Core/Core/PowerPC/Profiler.h b/Source/Core/Core/PowerPC/Profiler.h index 6146bee4d1..33a8b225bb 100644 --- a/Source/Core/Core/PowerPC/Profiler.h +++ b/Source/Core/Core/PowerPC/Profiler.h @@ -10,37 +10,6 @@ #include "Common/CommonTypes.h" -#include "Common/PerformanceCounter.h" - -#if defined(_M_X86_64) - -#define PROFILER_QUERY_PERFORMANCE_COUNTER(pt) \ - MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(pt))); \ - ABI_CallFunction(QueryPerformanceCounter) - -// block->ticCounter += block->ticStop - block->ticStart -#define PROFILER_UPDATE_TIME(block) \ - MOV(64, R(RSCRATCH2), Imm64((u64)block)); \ - 
MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStop))); \ - SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStart))); \ - ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter))); \ - MOV(64, MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)), R(RSCRATCH)); - -#define PROFILER_VPUSH \ - BitSet32 registersInUse = CallerSavedRegistersInUse(); \ - ABI_PushRegistersAndAdjustStack(registersInUse, 0); - -#define PROFILER_VPOP ABI_PopRegistersAndAdjustStack(registersInUse, 0); - -#else - -#define PROFILER_QUERY_PERFORMANCE_COUNTER(pt) -#define PROFILER_UPDATE_TIME(b) -#define PROFILER_VPUSH -#define PROFILER_VPOP - -#endif - struct BlockStat { BlockStat(u32 _addr, u64 c, u64 ticks, u64 run, u32 size) From 95ce8602659576ec9beb0c61f0f1db73fedfc841 Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 12 Aug 2017 21:10:16 +0200 Subject: [PATCH 2/7] DolphinWX: Enable branch following in the JIT debug interface. --- Source/Core/DolphinWX/Debugger/JitWindow.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/Core/DolphinWX/Debugger/JitWindow.cpp b/Source/Core/DolphinWX/Debugger/JitWindow.cpp index a62d1ba2f2..bf2eab365c 100644 --- a/Source/Core/DolphinWX/Debugger/JitWindow.cpp +++ b/Source/Core/DolphinWX/Debugger/JitWindow.cpp @@ -87,6 +87,7 @@ void CJitWindow::Compare(u32 em_address) PPCAnalyst::CodeBlock code_block; PPCAnalyst::PPCAnalyzer analyzer; analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); + analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW); code_block.m_stats = &st; code_block.m_gpa = &gpa; From 958b75b707e78e3b5bcc7fa9b54ea5a1d87926de Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 12 Aug 2017 22:18:22 +0200 Subject: [PATCH 3/7] JitCommon: Restructure the profiler calls. --- .../CachedInterpreter/CachedInterpreter.cpp | 1 - Source/Core/Core/PowerPC/Jit64/Jit.cpp | 28 +++++----- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 53 +++++++++---------- Source/Core/Core/PowerPC/JitCommon/JitCache.h | 15 +++--- Source/Core/Core/PowerPC/JitInterface.cpp | 10 ++-- 5 files changed, 52 insertions(+), 55 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 0067a7f3f4..ff7739905c 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -200,7 +200,6 @@ void CachedInterpreter::Jit(u32 address) b->checkedEntry = GetCodePtr(); b->normalEntry = GetCodePtr(); - b->runCount = 0; for (u32 i = 0; i < code_block.m_num_instructions; i++) { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 25b391aac7..875a3a298b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -628,7 +628,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc const u8* start = AlignCode4(); // TODO: Test if this or AlignCode16 make a difference from GetCodePtr b->checkedEntry = start; - b->runCount = 0; // Downcount flag check. The last block decremented downcounter, and the flag should still be // available. @@ -649,16 +648,13 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc } // Conditionally add profiling code. 
- b->ticCounter = 0; - b->ticStart = 0; - b->ticStop = 0; if (Profiler::g_ProfileBlocks) { - MOV(64, R(RSCRATCH), ImmPtr(&b->runCount)); - ADD(32, MatR(RSCRATCH), Imm8(1)); - // get start tic - MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->ticStart))); + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->profile_data.ticStart))); + int offset = static_cast(offsetof(JitBlock::ProfileData, runCount)) - + static_cast(offsetof(JitBlock::ProfileData, ticStart)); + ADD(64, MDisp(ABI_PARAM1, offset), Imm8(1)); ABI_CallFunction(QueryPerformanceCounter); } #if defined(_DEBUG) || defined(DEBUGFAST) || defined(NAN_CHECK) @@ -736,18 +732,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc { if (Profiler::g_ProfileBlocks) { - // WARNING - cmp->branch merging will screw this up. + // TODO: Move this to WriteExit() calls. BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); // get end tic - MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->ticStop))); + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->profile_data.ticStop))); ABI_CallFunction(QueryPerformanceCounter); // tic counter += (end tic - start tic) - MOV(64, R(RSCRATCH2), Imm64((u64)b)); - MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStop))); - SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStart))); - ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter))); - MOV(64, MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)), R(RSCRATCH)); + MOV(64, R(RSCRATCH2), Imm64(reinterpret_cast(&b->profile_data))); + MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStop))); + SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStart))); + ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter))); + ADD(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, downcountCounter)), + Imm32(js.downcountAmount)); + MOV(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)), R(RSCRATCH)); ABI_PopRegistersAndAdjustStack(registersInUse, 0); } js.isLastInstruction = true; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index a507485043..b7dbf5a161 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -520,10 +520,10 @@ void JitArm64::EmitResetCycleCounters() const u32 PMCR_EL0_P = 2; const u32 PMCR_EL0_C = 4; const u32 PMCR_EL0_LC = 0x40; - _MSR(FIELD_PMCR_EL0, X0); - MOVI2R(X1, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC); - ORR(X0, X0, X1); - MRS(X0, FIELD_PMCR_EL0); + _MSR(FIELD_PMCR_EL0, X10); + MOVI2R(X11, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC); + ORR(X10, X10, X11); + MRS(X10, FIELD_PMCR_EL0); } void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg) @@ -533,47 +533,54 @@ void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg) void JitArm64::BeginTimeProfile(JitBlock* b) { - b->ticCounter = 0; - b->ticStart = 0; - b->ticStop = 0; + MOVP2R(X0, &b->profile_data); + LDR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); + ADD(X1, X1, 1); if (m_supports_cycle_counter) { EmitResetCycleCounters(); - EmitGetCycles(X1); - MOVP2R(X0, &b->ticStart); - STR(INDEX_UNSIGNED, X1, X0, 0); + EmitGetCycles(X2); + + // stores runCount and ticStart + STP(INDEX_UNSIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount)); } else { + STR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); + 
MOVP2R(X1, &QueryPerformanceCounter); - MOVP2R(X0, &b->ticStart); + ADD(X0, X0, offsetof(JitBlock::ProfileData, ticStart)); BLR(X1); } } void JitArm64::EndTimeProfile(JitBlock* b) { + MOVP2R(X20, &b->profile_data); if (m_supports_cycle_counter) { EmitGetCycles(X2); - MOVP2R(X0, &b->ticStart); } else { MOVP2R(X1, &QueryPerformanceCounter); - MOVP2R(X0, &b->ticStop); + ADD(X0, X20, offsetof(JitBlock::ProfileData, ticStop)); BLR(X1); - MOVP2R(X0, &b->ticStart); - LDR(INDEX_UNSIGNED, X2, X0, 8); // Stop + LDR(INDEX_UNSIGNED, X2, X20, offsetof(JitBlock::ProfileData, ticStop)); } - LDR(INDEX_UNSIGNED, X1, X0, 0); // Start - LDR(INDEX_UNSIGNED, X3, X0, 16); // Counter + LDR(INDEX_UNSIGNED, X1, X20, offsetof(JitBlock::ProfileData, ticStart)); + + // loads ticCounter and downcountCounter + LDP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); SUB(X2, X2, X1); ADD(X3, X3, X2); - STR(INDEX_UNSIGNED, X3, X0, 16); + ADDI2R(X4, X4, js.downcountAmount); + + // stores ticCounter and downcountCounter + STP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); } void JitArm64::Run() @@ -657,7 +664,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* const u8* start = GetCodePtr(); b->checkedEntry = start; - b->runCount = 0; // Downcount flag check, Only valid for linked blocks { @@ -673,15 +679,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* // Conditionally add profiling code. if (Profiler::g_ProfileBlocks) { - ARM64Reg WA = gpr.GetReg(); - ARM64Reg WB = gpr.GetReg(); - ARM64Reg XA = EncodeRegTo64(WA); - ARM64Reg XB = EncodeRegTo64(WB); - MOVP2R(XA, &b->runCount); - LDR(INDEX_UNSIGNED, XB, XA, 0); - ADD(XB, XB, 1); - STR(INDEX_UNSIGNED, XB, XA, 0); - gpr.Unlock(WA, WB); // get start tic BeginTimeProfile(b); } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 76e2d4dbf9..7b8e44b1b2 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -49,7 +49,6 @@ struct JitBlock // The number of PPC instructions represented by this block. Mostly // useful for logging. u32 originalSize; - int runCount; // for profiling. // Information about exits to a known address from this block. // This is used to implement block linking. @@ -65,11 +64,15 @@ struct JitBlock // This set stores all physical addresses of all occupied instructions. std::set physical_addresses; - // we don't really need to save start and stop - // TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;) - u64 ticStart; // for profiling - time. - u64 ticStop; // for profiling - time. - u64 ticCounter; // for profiling - time. + // Block profiling data, structure is inlined in Jit.cpp + struct ProfileData + { + u64 ticCounter; + u64 downcountCounter; + u64 runCount; + u64 ticStart; + u64 ticStop; + } profile_data = {}; // This tracks the position if this block within the fast block cache. // We allow each block to have only one map entry. 
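As a reading aid for the emitted profiling code in this series, the per-block bookkeeping is roughly equivalent to the C++ sketch below. This is illustrative only and not part of the patch: the field names come from JitBlock::ProfileData above, QueryPerformanceCounter() stands in for whichever tick source the target JIT uses (CNTVCT on AArch64 in a later patch), and downcount_of_block is a hypothetical name for the block's js.downcountAmount.

    // Emitted only when Profiler::g_ProfileBlocks is set.
    // On block entry:
    b->profile_data.runCount++;
    QueryPerformanceCounter(&b->profile_data.ticStart);   // start tic

    // ... compiled block body executes ...

    // When the block ends:
    QueryPerformanceCounter(&b->profile_data.ticStop);    // end tic
    b->profile_data.ticCounter += b->profile_data.ticStop - b->profile_data.ticStart;
    b->profile_data.downcountCounter += downcount_of_block;  // js.downcountAmount, a JIT-time constant

GetProfileResults() in the next diff then uses downcountCounter as the cost metric and ticCounter/runCount for the timing statistics.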
diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp b/Source/Core/Core/PowerPC/JitInterface.cpp
index 2369eb75d6..284b7ea366 100644
--- a/Source/Core/Core/PowerPC/JitInterface.cpp
+++ b/Source/Core/Core/PowerPC/JitInterface.cpp
@@ -119,12 +119,12 @@ void GetProfileResults(ProfileStats* prof_stats)
   QueryPerformanceFrequency((LARGE_INTEGER*)&prof_stats->countsPerSec);
 
   g_jit->GetBlockCache()->RunOnBlocks([&prof_stats](const JitBlock& block) {
-    // Rough heuristic. Mem instructions should cost more.
-    u64 cost = block.originalSize * (block.runCount / 4);
-    u64 timecost = block.ticCounter;
+    const auto& data = block.profile_data;
+    u64 cost = data.downcountCounter;
+    u64 timecost = data.ticCounter;
     // Todo: tweak.
-    if (block.runCount >= 1)
-      prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, block.runCount,
+    if (data.runCount >= 1)
+      prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, data.runCount,
                                            block.codeSize);
     prof_stats->cost_sum += cost;
     prof_stats->timecost_sum += timecost;

From 9080192a656d0ab70f079cdb9bd3cbb31b05a0d4 Mon Sep 17 00:00:00 2001
From: degasus
Date: Sat, 12 Aug 2017 22:46:50 +0200
Subject: [PATCH 4/7] Jit64: Move profiler to the end of the block.

Within Cleanup(), it is called at *every* exit of the block. This
generates bigger code, but it is the only way to handle blocks with
multiple exit points.
---
 Source/Core/Core/PowerPC/Jit64/Jit.cpp | 35 +++++++++++++-------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 875a3a298b..d046a639ad 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -371,6 +371,23 @@ bool Jit64::Cleanup()
     did_something = true;
   }
 
+  if (Profiler::g_ProfileBlocks)
+  {
+    ABI_PushRegistersAndAdjustStack({}, 0);
+    // get end tic
+    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&js.curBlock->profile_data.ticStop)));
+    ABI_CallFunction(QueryPerformanceCounter);
+    // tic counter += (end tic - start tic)
+    MOV(64, R(RSCRATCH2), Imm64(reinterpret_cast(&js.curBlock->profile_data)));
+    MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStop)));
+    SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStart)));
+    ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)));
+    ADD(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, downcountCounter)),
+        Imm32(js.downcountAmount));
+    MOV(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)), R(RSCRATCH));
+    ABI_PopRegistersAndAdjustStack({}, 0);
+  }
+
   return did_something;
 }
 
@@ -730,24 +747,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
     if (i == (code_block.m_num_instructions - 1))
     {
-      if (Profiler::g_ProfileBlocks)
-      {
-        // TODO: Move this to WriteExit() calls.
-        BitSet32 registersInUse = CallerSavedRegistersInUse();
-        ABI_PushRegistersAndAdjustStack(registersInUse, 0);
-        // get end tic
-        MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->profile_data.ticStop)));
-        ABI_CallFunction(QueryPerformanceCounter);
-        // tic counter += (end tic - start tic)
-        MOV(64, R(RSCRATCH2), Imm64(reinterpret_cast(&b->profile_data)));
-        MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStop)));
-        SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStart)));
-        ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)));
-        ADD(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, downcountCounter)),
-            Imm32(js.downcountAmount));
-        MOV(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)), R(RSCRATCH));
-        ABI_PopRegistersAndAdjustStack(registersInUse, 0);
-      }
       js.isLastInstruction = true;
     }

From 304e601ad349f192406eaa356eb160a9a5e95f5a Mon Sep 17 00:00:00 2001
From: degasus
Date: Sun, 13 Aug 2017 01:51:37 +0200
Subject: [PATCH 5/7] JitArm64: Reimplement aarch64 cycle counters.

CNTVCT_EL0 is force-enabled on all Linux platforms. Windows is untested,
but as this is the best way to get *any* low-overhead performance
counters, it is likely used there as well.
---
 Source/Core/Common/Arm64Emitter.cpp       |  8 +++
 Source/Core/Common/Arm64Emitter.h         |  2 +-
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 85 +++++++----------------
 Source/Core/Core/PowerPC/JitArm64/Jit.h   |  6 --
 Source/Core/Core/PowerPC/Profiler.cpp     |  2 +-
 5 files changed, 34 insertions(+), 69 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index bd72df2e94..ed1fa2b626 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -1218,6 +1218,14 @@ void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field)
   EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt));
 }
 
+void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt)
+{
+  _assert_msg_(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit");
+
+  // MRS , CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt
+  EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt));
+}
+
 void ARM64XEmitter::HINT(SystemHint op)
 {
   EncodeSystemInst(0, 3, 2, 0, op, WSP);
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 4b5bc9f137..2f49795519 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -603,9 +603,9 @@ public:
 
   // System
   void _MSR(PStateField field, u8 imm);
-
   void _MSR(PStateField field, ARM64Reg Rt);
   void MRS(ARM64Reg Rt, PStateField field);
+  void CNTVCT(ARM64Reg Rt);
 
   void HINT(SystemHint op);
   void CLREX();
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index b7dbf5a161..7cc2c476c2 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -36,15 +36,6 @@ constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
 constexpr size_t GUARD_SIZE = 0x10000;  // two guards - bottom (permanent) and middle (see above)
 constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;
 
-static bool HasCycleCounters()
-{
-  // Bit needs to be set to support cycle counters
-  const u32 PMUSERENR_CR = 0x4;
-  u32 reg;
-  asm("mrs %[val], PMUSERENR_EL0" : [val] "=r"(reg));
-  return !!(reg & PMUSERENR_CR);
-}
-
 void JitArm64::Init()
 {
   InitializeInstructionTables();
@@ -72,8 +63,6 @@ void JitArm64::Init()
 
   AllocStack();
   GenerateAsm();
-
-  m_supports_cycle_counter = HasCycleCounters();
 }
 
 bool
JitArm64::HandleFault(uintptr_t access_address, SContext* ctx) @@ -514,73 +503,47 @@ void JitArm64::DumpCode(const u8* start, const u8* end) WARN_LOG(DYNA_REC, "Code dump from %p to %p:\n%s", start, end, output.c_str()); } -void JitArm64::EmitResetCycleCounters() -{ - const u32 PMCR_EL0_E = 1; - const u32 PMCR_EL0_P = 2; - const u32 PMCR_EL0_C = 4; - const u32 PMCR_EL0_LC = 0x40; - _MSR(FIELD_PMCR_EL0, X10); - MOVI2R(X11, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC); - ORR(X10, X10, X11); - MRS(X10, FIELD_PMCR_EL0); -} - -void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg) -{ - _MSR(FIELD_PMCCNTR_EL0, reg); -} - void JitArm64::BeginTimeProfile(JitBlock* b) { MOVP2R(X0, &b->profile_data); LDR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); ADD(X1, X1, 1); - if (m_supports_cycle_counter) - { - EmitResetCycleCounters(); - EmitGetCycles(X2); + // Fetch the current counter register + CNTVCT(X2); - // stores runCount and ticStart - STP(INDEX_UNSIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount)); - } - else - { - STR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); - - MOVP2R(X1, &QueryPerformanceCounter); - ADD(X0, X0, offsetof(JitBlock::ProfileData, ticStart)); - BLR(X1); - } + // stores runCount and ticStart + STP(INDEX_SIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount)); } void JitArm64::EndTimeProfile(JitBlock* b) { - MOVP2R(X20, &b->profile_data); - if (m_supports_cycle_counter) - { - EmitGetCycles(X2); - } - else - { - MOVP2R(X1, &QueryPerformanceCounter); - ADD(X0, X20, offsetof(JitBlock::ProfileData, ticStop)); - BLR(X1); + ARM64Reg WA = gpr.GetReg(); + ARM64Reg XA = EncodeRegTo64(WA); + ARM64Reg WB = gpr.GetReg(); + ARM64Reg XB = EncodeRegTo64(WB); + ARM64Reg WC = gpr.GetReg(); + ARM64Reg XC = EncodeRegTo64(WC); + ARM64Reg WD = gpr.GetReg(); + ARM64Reg XD = EncodeRegTo64(WD); - LDR(INDEX_UNSIGNED, X2, X20, offsetof(JitBlock::ProfileData, ticStop)); - } + // Fetch the current counter register + CNTVCT(XB); - LDR(INDEX_UNSIGNED, X1, X20, offsetof(JitBlock::ProfileData, ticStart)); + MOVP2R(XA, &b->profile_data); + + LDR(INDEX_UNSIGNED, XC, XA, offsetof(JitBlock::ProfileData, ticStart)); + SUB(XB, XB, XC); // loads ticCounter and downcountCounter - LDP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); - SUB(X2, X2, X1); - ADD(X3, X3, X2); - ADDI2R(X4, X4, js.downcountAmount); + LDP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); + ADD(XC, XC, XB); + ADDI2R(XD, XD, js.downcountAmount); // stores ticCounter and downcountCounter - STP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); + STP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); + + gpr.Unlock(WA, WB, WC, WD); } void JitArm64::Run() diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 5685440ca1..918ceb678c 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -174,9 +174,6 @@ private: static void InitializeInstructionTables(); void CompileInstruction(PPCAnalyst::CodeOp& op); - void EmitResetCycleCounters(); - void EmitGetCycles(Arm64Gen::ARM64Reg reg); - // Simple functions to switch between near and far code emitting void SwitchToFarCode() { @@ -253,9 +250,6 @@ private: Arm64Gen::ARM64CodeBlock farcode; u8* nearcode; // Backed up when we switch to far code. - // Do we support cycle counter profiling? 
- bool m_supports_cycle_counter; - bool m_enable_blr_optimization; bool m_cleanup_after_stackfault = false; u8* m_stack_base = nullptr; diff --git a/Source/Core/Core/PowerPC/Profiler.cpp b/Source/Core/Core/PowerPC/Profiler.cpp index 0e1a176727..0378c4a5b5 100644 --- a/Source/Core/Core/PowerPC/Profiler.cpp +++ b/Source/Core/Core/PowerPC/Profiler.cpp @@ -10,7 +10,7 @@ namespace Profiler { -bool g_ProfileBlocks; +bool g_ProfileBlocks = false; void WriteProfileResults(const std::string& filename) { From da79ddbde7457ca1c8d7c5eee9fd53c08c969f0a Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 22 Aug 2017 08:04:16 +0200 Subject: [PATCH 6/7] JitArm64: Rewrite Exit functions. The gpr must not be touched in the Exit functions as they are maybe conditional. So just allocate everything here manually. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 113 +++++++----------- .../Core/PowerPC/JitArm64/JitArm64_Branch.cpp | 6 +- 2 files changed, 48 insertions(+), 71 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 7cc2c476c2..4b27c13f00 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -161,6 +161,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) ARM64Reg WA = gpr.GetReg(); LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(npc)); WriteExceptionExit(WA); + gpr.Unlock(WA); } else { @@ -174,6 +175,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) FixupBranch c = B(CC_EQ); WriteExceptionExit(WA); SetJumpTarget(c); + gpr.Unlock(WA); } } @@ -211,6 +213,7 @@ void JitArm64::HLEFunction(UGeckoInstruction inst) ARM64Reg WA = gpr.GetReg(); LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(npc)); WriteExit(WA); + gpr.Unlock(WA); } void JitArm64::DoNothing(UGeckoInstruction inst) @@ -228,21 +231,16 @@ void JitArm64::Cleanup() { if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { - gpr.Lock(W0); MOVP2R(X0, &GPFifo::FastCheckGatherPipe); BLR(X0); - gpr.Unlock(W0); } } void JitArm64::DoDownCount() { - ARM64Reg WA = gpr.GetReg(); - LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(downcount)); - ARM64Reg WB = gpr.GetReg(); - SUBSI2R(WA, WA, js.downcountAmount, WB); - STR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(downcount)); - gpr.Unlock(WA, WB); + LDR(INDEX_UNSIGNED, W0, PPC_REG, PPCSTATE_OFF(downcount)); + SUBSI2R(W0, W0, js.downcountAmount, W1); + STR(INDEX_UNSIGNED, W0, PPC_REG, PPCSTATE_OFF(downcount)); } void JitArm64::ResetStack() @@ -292,9 +290,7 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return { Cleanup(); DoDownCount(); - - if (Profiler::g_ProfileBlocks) - EndTimeProfile(js.curBlock); + EndTimeProfile(js.curBlock); LK &= m_enable_blr_optimization; @@ -331,17 +327,14 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return) { - Cleanup(); - DoDownCount(); - - LK &= m_enable_blr_optimization; - if (dest != DISPATCHER_PC) MOV(DISPATCHER_PC, dest); - gpr.Unlock(dest); - if (Profiler::g_ProfileBlocks) - EndTimeProfile(js.curBlock); + Cleanup(); + DoDownCount(); + EndTimeProfile(js.curBlock); + + LK &= m_enable_blr_optimization; if (!LK) { @@ -407,35 +400,28 @@ void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest) return; } + if (dest != DISPATCHER_PC) + MOV(DISPATCHER_PC, dest); + Cleanup(); - - if (Profiler::g_ProfileBlocks) - EndTimeProfile(js.curBlock); - - ARM64Reg code = gpr.GetReg(); - ARM64Reg pc = gpr.GetReg(); + 
EndTimeProfile(js.curBlock); // Check if {ARM_PC, PPC_PC} matches the current state. - LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16); - CMP(pc, dest); + LDP(INDEX_POST, X2, X1, SP, 16); + CMP(W1, DISPATCHER_PC); FixupBranch no_match = B(CC_NEQ); - DoDownCount(); + DoDownCount(); // overwrites X0 + X1 - RET(EncodeRegTo64(code)); + RET(X2); SetJumpTarget(no_match); DoDownCount(); - if (dest != DISPATCHER_PC) - MOV(DISPATCHER_PC, dest); - ResetStack(); B(dispatcher); - - gpr.Unlock(dest, pc, code); } void JitArm64::WriteExceptionExit(u32 destination, bool only_external) @@ -458,39 +444,34 @@ void JitArm64::WriteExceptionExit(u32 destination, bool only_external) SetJumpTarget(no_exceptions); - if (Profiler::g_ProfileBlocks) - EndTimeProfile(js.curBlock); + EndTimeProfile(js.curBlock); B(dispatcher); } void JitArm64::WriteExceptionExit(ARM64Reg dest, bool only_external) { + if (dest != DISPATCHER_PC) + MOV(DISPATCHER_PC, dest); + Cleanup(); DoDownCount(); - ARM64Reg WA = gpr.GetReg(); - LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); - FixupBranch no_exceptions = CBZ(WA); - gpr.Unlock(WA); + LDR(INDEX_UNSIGNED, W30, PPC_REG, PPCSTATE_OFF(Exceptions)); + FixupBranch no_exceptions = CBZ(W30); - STR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(pc)); - STR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(npc)); + STR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); + STR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(npc)); if (only_external) - MOVP2R(EncodeRegTo64(dest), &PowerPC::CheckExternalExceptions); + MOVP2R(EncodeRegTo64(DISPATCHER_PC), &PowerPC::CheckExternalExceptions); else - MOVP2R(EncodeRegTo64(dest), &PowerPC::CheckExceptions); - BLR(EncodeRegTo64(dest)); - LDR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(npc)); + MOVP2R(EncodeRegTo64(DISPATCHER_PC), &PowerPC::CheckExceptions); + BLR(EncodeRegTo64(DISPATCHER_PC)); + LDR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(npc)); SetJumpTarget(no_exceptions); - if (dest != DISPATCHER_PC) - MOV(DISPATCHER_PC, dest); - gpr.Unlock(dest); - - if (Profiler::g_ProfileBlocks) - EndTimeProfile(js.curBlock); + EndTimeProfile(js.curBlock); B(dispatcher); } @@ -518,32 +499,24 @@ void JitArm64::BeginTimeProfile(JitBlock* b) void JitArm64::EndTimeProfile(JitBlock* b) { - ARM64Reg WA = gpr.GetReg(); - ARM64Reg XA = EncodeRegTo64(WA); - ARM64Reg WB = gpr.GetReg(); - ARM64Reg XB = EncodeRegTo64(WB); - ARM64Reg WC = gpr.GetReg(); - ARM64Reg XC = EncodeRegTo64(WC); - ARM64Reg WD = gpr.GetReg(); - ARM64Reg XD = EncodeRegTo64(WD); + if (!Profiler::g_ProfileBlocks) + return; // Fetch the current counter register - CNTVCT(XB); + CNTVCT(X1); - MOVP2R(XA, &b->profile_data); + MOVP2R(X0, &b->profile_data); - LDR(INDEX_UNSIGNED, XC, XA, offsetof(JitBlock::ProfileData, ticStart)); - SUB(XB, XB, XC); + LDR(INDEX_UNSIGNED, X2, X0, offsetof(JitBlock::ProfileData, ticStart)); + SUB(X1, X1, X2); // loads ticCounter and downcountCounter - LDP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); - ADD(XC, XC, XB); - ADDI2R(XD, XD, js.downcountAmount); + LDP(INDEX_SIGNED, X2, X3, X0, offsetof(JitBlock::ProfileData, ticCounter)); + ADD(X2, X2, X1); + ADDI2R(X3, X3, js.downcountAmount, X1); // stores ticCounter and downcountCounter - STP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); - - gpr.Unlock(WA, WB, WC, WD); + STP(INDEX_SIGNED, X2, X3, X0, offsetof(JitBlock::ProfileData, ticCounter)); } void JitArm64::Run() diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp 
b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp index 82325b0f49..da49e0a3bf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp @@ -67,8 +67,8 @@ void JitArm64::rfi(UGeckoInstruction inst) LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_SRR0])); gpr.Unlock(WB, WC); - // WA is unlocked in this function WriteExceptionExit(WA); + gpr.Unlock(WA); } void JitArm64::bx(UGeckoInstruction inst) @@ -220,6 +220,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst) AND(WA, WA, 30, 29); // Wipe the bottom 2 bits. WriteExit(WA, inst.LK_3, js.compilerPC + 4); + + gpr.Unlock(WA); } void JitArm64::bclrx(UGeckoInstruction inst) @@ -275,6 +277,8 @@ void JitArm64::bclrx(UGeckoInstruction inst) WriteBLRExit(WA); + gpr.Unlock(WA); + if (conditional) SwitchToNearCode(); From 992893ba412fe3248d4add3f84f874638b7110c0 Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 2 Sep 2017 14:33:07 +0200 Subject: [PATCH 7/7] Jit64: Use ImmPtr. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index d046a639ad..1a8cd83972 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -375,10 +375,10 @@ bool Jit64::Cleanup() { ABI_PushRegistersAndAdjustStack({}, 0); // get end tic - MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&js.curBlock->profile_data.ticStop))); + MOV(64, R(ABI_PARAM1), ImmPtr(&js.curBlock->profile_data.ticStop)); ABI_CallFunction(QueryPerformanceCounter); // tic counter += (end tic - start tic) - MOV(64, R(RSCRATCH2), Imm64(reinterpret_cast(&js.curBlock->profile_data))); + MOV(64, R(RSCRATCH2), ImmPtr(&js.curBlock->profile_data)); MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStop))); SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStart))); ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter))); @@ -668,7 +668,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc if (Profiler::g_ProfileBlocks) { // get start tic - MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast(&b->profile_data.ticStart))); + MOV(64, R(ABI_PARAM1), ImmPtr(&b->profile_data.ticStart)); int offset = static_cast(offsetof(JitBlock::ProfileData, runCount)) - static_cast(offsetof(JitBlock::ProfileData, ticStart)); ADD(64, MDisp(ABI_PARAM1, offset), Imm8(1));