From 6683b194ffb06b68643c041f2f5b8df6eabe9863 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sat, 15 Nov 2014 22:12:24 +0000
Subject: [PATCH 1/3] ARMv7 register cache optimizations.

Enable support for not loading a destination register in the FPR cache.
Dump registers if they won't be used later in the block.
Stolen from Fiora.
---
 Source/Core/Core/PowerPC/JitArm32/Jit.cpp     |  7 +++++
 .../Core/PowerPC/JitArm32/JitFPRCache.cpp     | 29 +++++++++++++++++--
 .../Core/Core/PowerPC/JitArm32/JitFPRCache.h  |  2 ++
 .../Core/PowerPC/JitArm32/JitRegCache.cpp     | 17 +++++++++++
 .../Core/Core/PowerPC/JitArm32/JitRegCache.h  |  2 ++
 5 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
index cf2ef79562..b8a4ad91a3 100644
--- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
@@ -444,6 +444,13 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
 			BKPT(0x7777);
 		}
 		JitArmTables::CompileInstruction(ops[i]);
+
+		// If we have a register that will never be used again, flush it.
+		for (int j : ~ops[i].gprInUse)
+			gpr.StoreFromRegister(j);
+		for (int j : ~ops[i].fprInUse)
+			fpr.StoreFromRegister(j);
+
 		if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
 		{
 			// Don't do this yet
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.cpp b/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.cpp
index 3d2af8453e..3dd1b59a9f 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.cpp
@@ -161,7 +161,8 @@ ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad)
 		ArmCRegs[regindex].PS1 = PS1;
 
 		_regs[preg][PS1].LoadToReg(regindex);
-		emit->VLDR(ArmCRegs[regindex].Reg, R9, offset);
+		if (preLoad)
+			emit->VLDR(ArmCRegs[regindex].Reg, R9, offset);
 		return ArmCRegs[regindex].Reg;
 	}
 
@@ -178,7 +179,8 @@ ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad)
 	ArmCRegs[lastRegIndex].PS1 = PS1;
 
 	_regs[preg][PS1].LoadToReg(lastRegIndex);
-	emit->VLDR(ArmCRegs[lastRegIndex].Reg, R9, offsetNew);
+	if (preLoad)
+		emit->VLDR(ArmCRegs[lastRegIndex].Reg, R9, offsetNew);
 	return ArmCRegs[lastRegIndex].Reg;
 }
 
@@ -225,3 +227,26 @@ void ArmFPRCache::Flush(FlushMode mode)
 	}
 }
 
+void ArmFPRCache::StoreFromRegister(u32 preg)
+{
+	if (_regs[preg][0].GetType() != REG_NOTLOADED)
+	{
+		s16 offset = PPCSTATE_OFF(ps) + (preg * 16);
+		u32 regindex = _regs[preg][0].GetRegIndex();
+		emit->VSTR(ArmCRegs[regindex].Reg, R9, offset);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		_regs[preg][0].Flush();
+	}
+	if (_regs[preg][1].GetType() != REG_NOTLOADED)
+	{
+		s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + 8;
+		u32 regindex = _regs[preg][1].GetRegIndex();
+		emit->VSTR(ArmCRegs[regindex].Reg, R9, offset);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		_regs[preg][1].Flush();
+	}
+}
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.h b/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.h
index cf873eaa57..fd77b4da1b 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.h
+++ b/Source/Core/Core/PowerPC/JitArm32/JitFPRCache.h
@@ -45,4 +45,6 @@ public:
 	void Flush(FlushMode mode = FLUSH_ALL);
 	ArmGen::ARMReg R0(u32 preg, bool preLoad = true); // Returns a cached register
 	ArmGen::ARMReg R1(u32 preg, bool preLoad = true);
+
+	void StoreFromRegister(u32 preg);
 };
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp
index fc057fdc9e..cb11ed5644 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp
@@ -300,3 +300,20 @@ void ArmRegCache::Flush(FlushMode mode)
 	}
 }
 
+void ArmRegCache::StoreFromRegister(u32 preg)
+{
+	if (regs[preg].GetType() == REG_IMM)
+	{
+		// This changes the type over to a REG_REG and gets caught below.
+		BindToRegister(preg, true, true);
+	}
+	if (regs[preg].GetType() == REG_REG)
+	{
+		u32 regindex = regs[preg].GetRegIndex();
+		emit->STR(ArmCRegs[regindex].Reg, R9, PPCSTATE_OFF(gpr) + preg * 4);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		regs[preg].Flush();
+	}
+}
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.h b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.h
index 7e7acaaf9a..7ccf8eae2c 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.h
@@ -135,4 +135,6 @@ public:
 	// Public function doesn't kill immediates
 	// In reality when you call R(u32) it'll bind an immediate there
 	void BindToRegister(u32 preg, bool doLoad = true);
+
+	void StoreFromRegister(u32 preg);
 };

From b24197f913301dcef1655961da67cc94f278f696 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 16 Nov 2014 09:14:02 +0000
Subject: [PATCH 2/3] Adds MCR/MRC to the ARMv7 emitter.

---
 Source/Core/Common/ArmEmitter.cpp | 22 ++++++++++++++++++++++
 Source/Core/Common/ArmEmitter.h   |  4 ++++
 2 files changed, 26 insertions(+)

diff --git a/Source/Core/Common/ArmEmitter.cpp b/Source/Core/Common/ArmEmitter.cpp
index dda13cb4c1..15ef41044b 100644
--- a/Source/Core/Common/ArmEmitter.cpp
+++ b/Source/Core/Common/ArmEmitter.cpp
@@ -388,6 +388,28 @@ void ARMXEmitter::YIELD()
 	Write32(condition | 0x0320F001);
 }
 
+void ARMXEmitter::MRC(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2)
+{
+	_assert_msg_(DYNA_REC, coproc <= 0xF, "%s has co-processor that is %d when it must be under 16!", __FUNCTION__, coproc);
+	_assert_msg_(DYNA_REC, opc1 <= 7, "%s has opc1 that is %d when it must be under 8!", __FUNCTION__, opc1);
+	_assert_msg_(DYNA_REC, CRn <= 0xF, "%s has CRn that is %d when it must be under 16!", __FUNCTION__, CRn);
+	_assert_msg_(DYNA_REC, opc2 <= 7, "%s has opc2 that is %d when it must be under 8!", __FUNCTION__, opc2);
+
+	Write32(condition | (0b1110 << 24) | (opc1 << 21) | (1 << 20) | (CRn << 16) \
+		| (Rt << 12) | (coproc << 8) | (opc2 << 5) | (1 << 4) | CRm);
+}
+
+void ARMXEmitter::MCR(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2)
+{
+	_assert_msg_(DYNA_REC, coproc <= 0xF, "%s has co-processor that is %d when it must be under 16!", __FUNCTION__, coproc);
+	_assert_msg_(DYNA_REC, opc1 <= 7, "%s has opc1 that is %d when it must be under 8!", __FUNCTION__, opc1);
+	_assert_msg_(DYNA_REC, CRn <= 0xF, "%s has CRn that is %d when it must be under 16!", __FUNCTION__, CRn);
+	_assert_msg_(DYNA_REC, opc2 <= 7, "%s has opc2 that is %d when it must be under 8!", __FUNCTION__, opc2);
+
+	Write32(condition | (0b1110 << 24) | (opc1 << 21) | (CRn << 16) \
+		| (Rt << 12) | (coproc << 8) | (opc2 << 5) | (1 << 4) | CRm);
+}
+
 FixupBranch ARMXEmitter::B()
 {
 	FixupBranch branch;
diff --git a/Source/Core/Common/ArmEmitter.h b/Source/Core/Common/ArmEmitter.h
index ed0376e9c8..cde7e9db66 100644
--- a/Source/Core/Common/ArmEmitter.h
+++ b/Source/Core/Common/ArmEmitter.h
@@ -385,6 +385,10 @@ public:
 	// Hint instruction
 	void YIELD();
 
+	// System
+	void MRC(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2 = 0);
+	void MCR(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2 = 0);
+
 	// Do nothing
 	void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
 
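Note (illustrative sketch, not part of the patch series): the MRC/MCR emitters
above take their operands in the standard ARMv7-A co-processor form
"MRC/MCR p<coproc>, <opc1>, <Rt>, c<CRn>, c<CRm>, <opc2>". As a minimal usage
sketch -- the helper name and the include path are assumptions, not code from
these patches -- this is how a caller holding an ARMXEmitter could emit a read
of the PMU cycle counter, the same operand pattern patch 3 relies on:

#include "Common/ArmEmitter.h"  // assumed include path for ARMXEmitter in this tree

using namespace ArmGen;

// Emits "mrc p15, 0, <dest>, c9, c13, 0", i.e. a read of PMCCNTR
// (the PMU cycle counter) into dest.
static void EmitReadCycleCounter(ARMXEmitter& emitter, ARMReg dest)
{
	emitter.MRC(15, 0, dest, 9, 13, 0);
}
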
From 30e1749d004f78ff631baab3db93d61396cc9355 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 16 Nov 2014 09:20:01 +0000
Subject: [PATCH 3/3] Implements block time profiling on ARMv7.

This was interesting to implement. Our generic QueryPerformanceCounter
function on ARMv7 was so slow that profiling a block was impossible; I
waited about five minutes and couldn't even get a single frame to output.

This instead uses ARMv7's PMU to get cycle counts, which causes only a
relatively minor performance drop in my testing. One disadvantage of this
method is that the kernel can lock us out of using these co-processor
registers, but it seems to work on my Jetson board. Another disadvantage
is that block times are reported in cycles rather than "real" time, which
is not too big of a deal.

This also removes instruction run counts from profiling, since they are
just annoying to maintain and we don't even expose an interface for
getting those results from our UI.
---
 Source/Core/Core/PowerPC/JitArm32/Jit.cpp    | 105 +++++++++++++------
 Source/Core/Core/PowerPC/JitArm32/Jit.h      |   4 +
 Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp |  11 ++
 Source/Core/Core/PowerPC/JitArm32/JitAsm.h   |   2 +
 4 files changed, 90 insertions(+), 32 deletions(-)

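Note (illustrative sketch, not part of the patch): the PMU programming emitted
by BeginTimeProfile/EndTimeProfile below corresponds roughly to the following
GCC inline assembly for ARMv7-A. The helper names are made up for illustration,
and the sketch assumes the kernel grants user-mode access to the PMU; if it
does not, these co-processor accesses raise SIGILL, as noted above.

#include <stdint.h>

static inline void pmu_start(void)
{
	// PMCR: enable counters, reset the event counters and the cycle counter,
	// and enable export (the same options BeginTimeProfile sets).
	const uint32_t pmcr = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 4);
	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(pmcr));
	// PMCNTENSET: enable the cycle counter and the first four event counters.
	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x8000000Fu));
	// PMOVSR: clear any pending counter overflow flags.
	asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r"(0x8000000Fu));
}

static inline uint32_t pmu_read_cycles(void)
{
	uint32_t cycles;
	// PMCCNTR: the PMU cycle counter that Begin/EndTimeProfile sample.
	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cycles));
	return cycles;
}
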
diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
index b8a4ad91a3..7a95e38f5a 100644
--- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp
@@ -150,6 +150,10 @@ void JitArm::WriteExitDestInR(ARMReg Reg)
 	STR(Reg, R9, PPCSTATE_OFF(pc));
 	Cleanup();
 	DoDownCount();
+
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	MOVI2R(Reg, (u32)asm_routines.dispatcher);
 	B(Reg);
 	gpr.Unlock(Reg);
@@ -160,6 +164,9 @@ void JitArm::WriteRfiExitDestInR(ARMReg Reg)
 	Cleanup();
 	DoDownCount();
 
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	ARMReg A = gpr.GetReg(false);
 
 	LDR(A, R9, PPCSTATE_OFF(pc));
@@ -177,6 +184,9 @@ void JitArm::WriteExceptionExit()
 	Cleanup();
 	DoDownCount();
 
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	ARMReg A = gpr.GetReg(false);
 
 	LDR(A, R9, PPCSTATE_OFF(pc));
@@ -193,6 +203,10 @@ void JitArm::WriteExit(u32 destination)
 	Cleanup();
 	DoDownCount();
+
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	//If nobody has taken care of this yet (this can be removed when all branches are done)
 	JitBlock *b = js.curBlock;
 	JitBlock::LinkData linkData;
@@ -273,6 +287,64 @@ void JitArm::Break(UGeckoInstruction inst)
 	BKPT(0x4444);
 }
 
+void JitArm::BeginTimeProfile(JitBlock* b)
+{
+	b->ticCounter = 0;
+	b->ticStart = 0;
+	b->ticStop = 0;
+
+	// Performance counters are a bit finicky on ARM.
+	// We must first enable and program the PMU before using it.
+	// This is a per-core operation, so with thread scheduling we may jump to a core where we haven't enabled the PMU yet.
+	// Work around this by enabling the PMU each time at the start of a block.
+	// Some ARM CPUs are getting absurd core counts (48+!).
+	// We have to reset the counters at the start of every block anyway, so we may as well.
+	// One thing to note about performance counters on ARM:
+	// the kernel can block access to these co-processor registers.
+	// If that happens, these instructions will generate a SIGILL.
+
+	// Refer to the ARM ARM about PMCR for what these do exactly.
+	enum
+	{
+		PERF_OPTION_ENABLE = (1 << 0),
+		PERF_OPTION_RESET_CR = (1 << 1),
+		PERF_OPTION_RESET_CCR = (1 << 2),
+		PERF_OPTION_DIVIDER_MODE = (1 << 3),
+		PERF_OPTION_EXPORT_ENABLE = (1 << 4),
+	};
+	const u32 perf_options =
+		PERF_OPTION_ENABLE |
+		PERF_OPTION_RESET_CR |
+		PERF_OPTION_RESET_CCR |
+		PERF_OPTION_EXPORT_ENABLE;
+	MOVI2R(R0, perf_options);
+	// Programs the PMCR
+	MCR(15, 0, R0, 9, 12, 0);
+
+	MOVI2R(R0, 0x8000000F);
+	// Enables all counters
+	MCR(15, 0, R0, 9, 12, 1);
+	// Clears all counter overflows
+	MCR(15, 0, R0, 9, 12, 3);
+
+	// Gets the cycle counter
+	MRC(15, 0, R1, 9, 13, 0);
+	MOVI2R(R0, (u32)&b->ticStart);
+	STR(R1, R0, 0);
+}
+
+void JitArm::EndTimeProfile(JitBlock* b)
+{
+	// Gets the cycle counter
+	MRC(15, 0, R1, 9, 13, 0);
+	MOVI2R(R0, (u32)&b->ticStop);
+	STR(R1, R0, 0);
+
+	MOVI2R(R0, (u32)&b->ticStart);
+	MOVI2R(R14, (u32)asm_routines.m_increment_profile_counter);
+	BL(R14);
+}
+
 const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 {
 	int blockSize = code_buf->GetSize();
@@ -362,8 +434,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
 		LDR(rB, rA); // Load the actual value in to R11.
 		ADD(rB, rB, 1); // Add one to the value
 		STR(rB, rA); // Now store it back in the memory location
-		// get start tic
-		PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart);
+		BeginTimeProfile(b);
 		gpr.Unlock(rA, rB);
 	}
 	gpr.Start(js.gpa);
@@ -390,16 +461,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
 			// WARNING - cmp->branch merging will screw this up.
 			js.isLastInstruction = true;
 			js.next_inst = 0;
-			if (Profiler::g_ProfileBlocks)
-			{
-				// CAUTION!!! push on stack regs you use, do your stuff, then pop
-				PROFILER_VPUSH;
-				// get end tic
-				PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
-				// tic counter += (end tic - start tic)
-				PROFILER_UPDATE_TIME(&b);
-				PROFILER_VPOP;
-			}
 		}
 		else
 		{
@@ -416,26 +477,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
 			POP(4, R0, R1, R2, R3);
 		}
 
-		if (Profiler::g_ProfileBlocks)
-		{
-			// Add run count
-			static const u64 One = 1;
-			ARMReg RA = gpr.GetReg();
-			ARMReg RB = gpr.GetReg();
-			ARMReg VA = fpr.GetReg();
-			ARMReg VB = fpr.GetReg();
-			MOVI2R(RA, (u32)&opinfo->runCount);
-			MOVI2R(RB, (u32)&One);
-			VLDR(VA, RA, 0);
-			VLDR(VB, RB, 0);
-			NEONXEmitter nemit(this);
-			nemit.VADD(I_64, VA, VA, VB);
-			VSTR(VA, RA, 0);
-			gpr.Unlock(RA, RB);
-			fpr.Unlock(VA);
-			fpr.Unlock(VB);
-		}
-
 		if (!ops[i].skip)
 		{
 			if (js.memcheck && (opinfo->flags & FL_USE_FPU))
diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h
index 7021113d73..4d9493a463 100644
--- a/Source/Core/Core/PowerPC/JitArm32/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h
@@ -58,6 +58,10 @@ private:
 	ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
 
 	bool BackPatch(SContext* ctx);
+
+	void BeginTimeProfile(JitBlock* b);
+	void EndTimeProfile(JitBlock* b);
+
 public:
 	JitArm() : code_buffer(32000) {}
 	~JitArm() {}
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp
index 6bbbefb744..bb19c300c4 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp
@@ -609,4 +609,15 @@ void JitArmAsmRoutineManager::GenerateCommon()
 	pairedStoreQuantized[14] = storeSingleS8;
 	pairedStoreQuantized[15] = storeSingleS16;
 
+	m_increment_profile_counter = AlignCode16();
+
+	nemit.VLD1(I_64, D0, R0); // Start
+	ADD(R0, R0, 8);
+	nemit.VLD1(I_64, D1, R0); // End
+	ADD(R0, R0, 8);
+	nemit.VLD1(I_64, D2, R0); // Counter
+	nemit.VSUB(I_64, D1, D1, D0);
+	nemit.VADD(I_64, D2, D2, D1);
+	nemit.VST1(I_64, D2, R0);
+	MOV(_PC, _LR);
 }
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.h b/Source/Core/Core/PowerPC/JitArm32/JitAsm.h
index 41cd248336..610b9c827c 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.h
+++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.h
@@ -14,6 +14,8 @@ private:
 	void GenerateCommon();
 
 public:
+	const u8* m_increment_profile_counter;
+
 	void Init()
 	{
 		AllocCodeSpace(8192);
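
Note (illustrative sketch, not part of the patch): the m_increment_profile_counter
routine added to JitAsm.cpp above performs, in NEON, the update sketched below in
plain C++. The struct here is hypothetical; it only mirrors the ticStart, ticStop,
and ticCounter fields the routine walks through R0, assuming they are laid out as
consecutive 64-bit values, as the I_64 operations suggest.

#include <stdint.h>

struct BlockProfileTimes
{
	uint64_t ticStart;   // cycle count sampled by BeginTimeProfile
	uint64_t ticStop;    // cycle count sampled by EndTimeProfile
	uint64_t ticCounter; // accumulated cycles spent in this block
};

// Equivalent of the emitted NEON sequence: counter += stop - start.
static inline void IncrementProfileCounter(BlockProfileTimes* t)
{
	t->ticCounter += t->ticStop - t->ticStart;
}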