Implements block time profiling on ARMv7.
This was interesting implementing. Our generic QueryPerformanceCounter function on ARMv7 was so slow that profiling a block was impossible. I waited about five minutes and I couldn't even get a single frame to output. This instead uses ARMv7's PMU to get cycle counts, which are a relatively minor performance drop in my testing. One disadvantage of this method is that the kernel can lock us out of using these co-processor registers, but it seems to work on my Jetson board. Another disadvantage is that we aren't having block times in "real" time but cycles instead, not too big of a deal. This also removes instruction run counts from profiling because that's just annoying and we don't expose an interface for even getting those results from our UI.
This commit is contained in:
parent
b24197f913
commit
30e1749d00
|
@ -150,6 +150,10 @@ void JitArm::WriteExitDestInR(ARMReg Reg)
|
|||
STR(Reg, R9, PPCSTATE_OFF(pc));
|
||||
Cleanup();
|
||||
DoDownCount();
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
MOVI2R(Reg, (u32)asm_routines.dispatcher);
|
||||
B(Reg);
|
||||
gpr.Unlock(Reg);
|
||||
|
@ -160,6 +164,9 @@ void JitArm::WriteRfiExitDestInR(ARMReg Reg)
|
|||
Cleanup();
|
||||
DoDownCount();
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
ARMReg A = gpr.GetReg(false);
|
||||
|
||||
LDR(A, R9, PPCSTATE_OFF(pc));
|
||||
|
@ -177,6 +184,9 @@ void JitArm::WriteExceptionExit()
|
|||
Cleanup();
|
||||
DoDownCount();
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
ARMReg A = gpr.GetReg(false);
|
||||
|
||||
LDR(A, R9, PPCSTATE_OFF(pc));
|
||||
|
@ -193,6 +203,10 @@ void JitArm::WriteExit(u32 destination)
|
|||
Cleanup();
|
||||
|
||||
DoDownCount();
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
EndTimeProfile(js.curBlock);
|
||||
|
||||
//If nobody has taken care of this yet (this can be removed when all branches are done)
|
||||
JitBlock *b = js.curBlock;
|
||||
JitBlock::LinkData linkData;
|
||||
|
@ -273,6 +287,64 @@ void JitArm::Break(UGeckoInstruction inst)
|
|||
BKPT(0x4444);
|
||||
}
|
||||
|
||||
void JitArm::BeginTimeProfile(JitBlock* b)
|
||||
{
|
||||
b->ticCounter = 0;
|
||||
b->ticStart = 0;
|
||||
b->ticStop = 0;
|
||||
|
||||
// Performance counters are bit finnicky on ARM
|
||||
// We must first enable and program the PMU before using it
|
||||
// This is a per core operation so with thread scheduling we may jump to a core we haven't enabled PMU yet
|
||||
// Work around this by enabling PMU each time at the start of a block
|
||||
// Some ARM CPUs are getting absurd core counts(48+!)
|
||||
// We have to reset counters at the start of every block anyway, so may as well.
|
||||
// One thing to note about performance counters on ARM
|
||||
// The kernel can block access to these co-processor registers
|
||||
// In the case that this happens, these will generate a SIGILL
|
||||
|
||||
// Refer to the ARM ARM about PMCR for what these do exactly
|
||||
enum
|
||||
{
|
||||
PERF_OPTION_ENABLE = (1 << 0),
|
||||
PERF_OPTION_RESET_CR = (1 << 1),
|
||||
PERF_OPTION_RESET_CCR = (1 << 2),
|
||||
PERF_OPTION_DIVIDER_MODE = (1 << 3),
|
||||
PERF_OPTION_EXPORT_ENABLE = (1 << 4),
|
||||
};
|
||||
const u32 perf_options =
|
||||
PERF_OPTION_ENABLE |
|
||||
PERF_OPTION_RESET_CR |
|
||||
PERF_OPTION_RESET_CCR |
|
||||
PERF_OPTION_EXPORT_ENABLE;
|
||||
MOVI2R(R0, perf_options);
|
||||
// Programs the PMCR
|
||||
MCR(15, 0, R0, 9, 12, 0);
|
||||
|
||||
MOVI2R(R0, 0x8000000F);
|
||||
// Enables all counters
|
||||
MCR(15, 0, R0, 9, 12, 1);
|
||||
// Clears all counter overflows
|
||||
MCR(15, 0, R0, 9, 12, 3);
|
||||
|
||||
// Gets the cycle counter
|
||||
MRC(15, 0, R1, 9, 13, 0);
|
||||
MOVI2R(R0, (u32)&b->ticStart);
|
||||
STR(R1, R0, 0);
|
||||
}
|
||||
|
||||
void JitArm::EndTimeProfile(JitBlock* b)
|
||||
{
|
||||
// Gets the cycle counter
|
||||
MRC(15, 0, R1, 9, 13, 0);
|
||||
MOVI2R(R0, (u32)&b->ticStop);
|
||||
STR(R1, R0, 0);
|
||||
|
||||
MOVI2R(R0, (u32)&b->ticStart);
|
||||
MOVI2R(R14, (u32)asm_routines.m_increment_profile_counter);
|
||||
BL(R14);
|
||||
}
|
||||
|
||||
const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
|
||||
{
|
||||
int blockSize = code_buf->GetSize();
|
||||
|
@ -362,8 +434,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
|
|||
LDR(rB, rA); // Load the actual value in to R11.
|
||||
ADD(rB, rB, 1); // Add one to the value
|
||||
STR(rB, rA); // Now store it back in the memory location
|
||||
// get start tic
|
||||
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart);
|
||||
BeginTimeProfile(b);
|
||||
gpr.Unlock(rA, rB);
|
||||
}
|
||||
gpr.Start(js.gpa);
|
||||
|
@ -390,16 +461,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
|
|||
// WARNING - cmp->branch merging will screw this up.
|
||||
js.isLastInstruction = true;
|
||||
js.next_inst = 0;
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
{
|
||||
// CAUTION!!! push on stack regs you use, do your stuff, then pop
|
||||
PROFILER_VPUSH;
|
||||
// get end tic
|
||||
PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
|
||||
// tic counter += (end tic - start tic)
|
||||
PROFILER_UPDATE_TIME(&b);
|
||||
PROFILER_VPOP;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -416,26 +477,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
|
|||
POP(4, R0, R1, R2, R3);
|
||||
}
|
||||
|
||||
if (Profiler::g_ProfileBlocks)
|
||||
{
|
||||
// Add run count
|
||||
static const u64 One = 1;
|
||||
ARMReg RA = gpr.GetReg();
|
||||
ARMReg RB = gpr.GetReg();
|
||||
ARMReg VA = fpr.GetReg();
|
||||
ARMReg VB = fpr.GetReg();
|
||||
MOVI2R(RA, (u32)&opinfo->runCount);
|
||||
MOVI2R(RB, (u32)&One);
|
||||
VLDR(VA, RA, 0);
|
||||
VLDR(VB, RB, 0);
|
||||
NEONXEmitter nemit(this);
|
||||
nemit.VADD(I_64, VA, VA, VB);
|
||||
VSTR(VA, RA, 0);
|
||||
gpr.Unlock(RA, RB);
|
||||
fpr.Unlock(VA);
|
||||
fpr.Unlock(VB);
|
||||
}
|
||||
|
||||
if (!ops[i].skip)
|
||||
{
|
||||
if (js.memcheck && (opinfo->flags & FL_USE_FPU))
|
||||
|
|
|
@ -58,6 +58,10 @@ private:
|
|||
ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
|
||||
|
||||
bool BackPatch(SContext* ctx);
|
||||
|
||||
void BeginTimeProfile(JitBlock* b);
|
||||
void EndTimeProfile(JitBlock* b);
|
||||
|
||||
public:
|
||||
JitArm() : code_buffer(32000) {}
|
||||
~JitArm() {}
|
||||
|
|
|
@ -609,4 +609,15 @@ void JitArmAsmRoutineManager::GenerateCommon()
|
|||
pairedStoreQuantized[14] = storeSingleS8;
|
||||
pairedStoreQuantized[15] = storeSingleS16;
|
||||
|
||||
m_increment_profile_counter = AlignCode16();
|
||||
|
||||
nemit.VLD1(I_64, D0, R0); // Start
|
||||
ADD(R0, R0, 8);
|
||||
nemit.VLD1(I_64, D1, R0); // End
|
||||
ADD(R0, R0, 8);
|
||||
nemit.VLD1(I_64, D2, R0); // Counter
|
||||
nemit.VSUB(I_64, D1, D1, D0);
|
||||
nemit.VADD(I_64, D2, D2, D1);
|
||||
nemit.VST1(I_64, D2, R0);
|
||||
MOV(_PC, _LR);
|
||||
}
|
||||
|
|
|
@ -14,6 +14,8 @@ private:
|
|||
void GenerateCommon();
|
||||
|
||||
public:
|
||||
const u8* m_increment_profile_counter;
|
||||
|
||||
void Init()
|
||||
{
|
||||
AllocCodeSpace(8192);
|
||||
|
|
Loading…
Reference in New Issue