From 7ba9a8537bf15acae011aa01fbbebc3bcc634596 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 9 Oct 2014 16:11:10 -0700 Subject: [PATCH] JIT: add basic register allocation heuristics Should be at least a bit better than the previous LRU approach. Currently has two basic components: whether a register is dirty (dirty registers need to be stored, so clobbering them hurts more) and how many other registers will be used between now and the next time a register gets used. Also don't pre-load values that don't need to be in registers. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 3 +- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 130 +++++++++++++----- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 10 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 55 ++++---- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- 6 files changed, 134 insertions(+), 69 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 10ad1ee4c1..e9b7333ed2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -603,6 +603,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.compilerPC = ops[i].address; js.op = &ops[i]; js.instructionNumber = i; + js.instructionsLeft = (code_block.m_num_instructions - 1) - i; const GekkoOPInfo *opinfo = ops[i].opinfo; js.downcountAmount += opinfo->numCycles; @@ -737,7 +738,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) { int reg = ops[i].regsIn[k]; - if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm()) + if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm()) gpr.BindToRegister(reg, true, false); } for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp 
index 622d0b535d..a4e3c6c8b8 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -11,7 +11,7 @@ using namespace Gen; using namespace PowerPC; -RegCache::RegCache() : emit(nullptr), cur_use_quantum(0) +RegCache::RegCache() : emit(nullptr) { } @@ -29,7 +29,6 @@ void RegCache::Start() regs[i].location = GetDefaultLocation(i); regs[i].away = false; regs[i].locked = false; - regs[i].last_used_quantum = 0; } // todo: sort to find the most popular regs @@ -96,6 +95,82 @@ void RegCache::UnlockAllX() xreg.locked = false; } +u32 GPRRegCache::GetRegUtilization() +{ + return jit->js.op->gprInReg; +} + +u32 FPURegCache::GetRegUtilization() +{ + return jit->js.op->fprInXmm; +} + +u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) +{ + u32 regsUsed = 0; + for (u32 i = 1; i < lookahead; i++) + { + for (int j = 0; j < 3; j++) + if (jit->js.op[i].regsIn[j] >= 0) + regsUsed |= 1 << jit->js.op[i].regsIn[j]; + for (int j = 0; j < 3; j++) + if (jit->js.op[i].regsIn[j] == preg) + return regsUsed; + } + return regsUsed; +} + +u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) +{ + u32 regsUsed = 0; + for (u32 i = 1; i < lookahead; i++) + { + for (int j = 0; j < 4; j++) + if (jit->js.op[i].fregsIn[j] >= 0) + regsUsed |= 1 << jit->js.op[i].fregsIn[j]; + for (int j = 0; j < 4; j++) + if (jit->js.op[i].fregsIn[j] == preg) + return regsUsed; + } + return regsUsed; +} + +// Estimate roughly how bad it would be to de-allocate this register. Higher score +// means more bad. +float RegCache::ScoreRegister(X64Reg xr) +{ + size_t preg = xregs[xr].ppcReg; + float score = 0; + + // If it's not dirty, we don't need a store to write it back to the register file, so + // bias a bit against dirty registers. Testing shows that a bias of 2 seems roughly + // right: 3 causes too many extra clobbers, while 1 saves very few clobbers relative + // to the number of extra stores it causes. 
+ if (xregs[xr].dirty) + score += 2; + + // If the register isn't actually needed in a physical register for a later instruction, + // writing it back to the register file isn't quite as bad. + if (GetRegUtilization() & (1 << preg)) + { + u32 regsUsed = 0; + // Don't look too far ahead; we don't want to have quadratic compilation times for + // enormous block sizes! + // This actually improves register allocation a tiny bit; I'm not sure why. + u32 lookahead = std::min(jit->js.instructionsLeft, 64); + // Count how many other registers are going to be used before we need this one again. + u32 regs_in = CountRegsIn(preg, lookahead); + u32 regs_in_count = 0; + for (int i = 0; i < 32; i++) + regs_in_count += !!(regs_in & (1 << i)); + // Totally ad-hoc heuristic to bias based on how many other registers we'll need + // before this one gets used again. + score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count)); + } + + return score; +} + X64Reg RegCache::GetFreeXReg() { size_t aCount; @@ -108,45 +183,31 @@ X64Reg RegCache::GetFreeXReg() return (X64Reg)xr; } } - // Okay, not found :( Force grab one! - // First, see if we have any registers that are only going to be used for a float store. - // These go through GPRs, so the cost of tossing them back into memory is lower than anything else. + // Okay, not found; run the register allocator heuristic and figure out which register we should + // clobber. 
+ float min_score = std::numeric_limits<float>::max(); + X64Reg best_xreg = INVALID_REG; + size_t best_preg = 0; for (size_t i = 0; i < aCount; i++) { - X64Reg xr = (X64Reg)aOrder[i]; - if (xregs[xr].locked) + X64Reg xreg = (X64Reg)aOrder[i]; + size_t preg = xregs[xreg].ppcReg; + if (xregs[xreg].locked || regs[preg].locked) continue; - size_t preg = xregs[xr].ppcReg; - if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg))) + float score = ScoreRegister(xreg); + if (score < min_score) { - StoreFromRegister(preg); - return xr; + min_score = score; + best_xreg = xreg; + best_preg = preg; } } - //TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions - u32 last_used = 0xFFFFFFFF; - X64Reg last_used_xr = INVALID_REG; - size_t last_used_preg = 0; - for (size_t i = 0; i < aCount; i++) + if (best_xreg != INVALID_REG) { - X64Reg xr = (X64Reg)aOrder[i]; - if (xregs[xr].locked) - continue; - size_t preg = xregs[xr].ppcReg; - if (!regs[preg].locked && regs[preg].last_used_quantum < last_used) - { - last_used = regs[preg].last_used_quantum; - last_used_xr = xr; - last_used_preg = preg; - } - } - - if (last_used_xr != INVALID_REG) - { - StoreFromRegister(last_used_preg); - return last_used_xr; + StoreFromRegister(best_preg); + return best_xreg; } //Still no dice? Die! 
@@ -197,7 +258,6 @@ void RegCache::DiscardRegContentsIfCached(size_t preg) xregs[xr].ppcReg = INVALID_REG; regs[preg].away = false; regs[preg].location = GetDefaultLocation(preg); - regs[preg].last_used_quantum = 0; } } @@ -279,7 +339,6 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) } regs[i].away = true; regs[i].location = ::Gen::R(xr); - regs[i].last_used_quantum = ++cur_use_quantum; } else { @@ -322,7 +381,6 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode) { regs[i].location = newLoc; regs[i].away = false; - regs[i].last_used_quantum = 0; } } } @@ -378,8 +436,6 @@ void RegCache::Flush(FlushMode mode) } } } - - cur_use_quantum = 0; } int RegCache::NumFreeRegisters() diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 0fe3e9fe5f..59b1a5abe6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -20,7 +20,6 @@ struct PPCCachedReg Gen::OpArg location; bool away; // value not in source register bool locked; - u32 last_used_quantum; }; struct X64CachedReg @@ -44,9 +43,12 @@ protected: virtual const int *GetAllocationOrder(size_t& count) = 0; + virtual u32 GetRegUtilization() = 0; + virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0; + Gen::XEmitter *emit; - u32 cur_use_quantum; + float ScoreRegister(Gen::X64Reg xreg); public: RegCache(); @@ -134,6 +136,8 @@ public: Gen::OpArg GetDefaultLocation(size_t reg) const override; const int* GetAllocationOrder(size_t& count) override; void SetImmediate32(size_t preg, u32 immValue); + u32 GetRegUtilization(); + u32 CountRegsIn(size_t preg, u32 lookahead); }; @@ -144,4 +148,6 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; const int* GetAllocationOrder(size_t& count) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; + u32 GetRegUtilization(); + u32 CountRegsIn(size_t preg, u32 lookahead); }; diff --git 
a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 4c3cd2562b..22c4ac9d2b 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -74,6 +74,7 @@ protected: u32 blockStart; UGeckoInstruction next_inst; // for easy peephole opt. int instructionNumber; + int instructionsLeft; int downcountAmount; u32 numLoadStoreInst; u32 numFloatingPointInst; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e0c76192b6..4bd5b60a14 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -796,57 +796,56 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. - bool wantsCR0 = true; - bool wantsCR1 = true; - bool wantsFPRF = true; - bool wantsCA = true; - u32 fregInUse = 0; - u32 regInUse = 0; - u32 fregInXmm = 0; + bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true; + u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0; for (int i = block->m_num_instructions - 1; i >= 0; i--) { - bool opWantsCR0 = code[i].wantsCR0; - bool opWantsCR1 = code[i].wantsCR1; + bool opWantsCR0 = code[i].wantsCR0; + bool opWantsCR1 = code[i].wantsCR1; bool opWantsFPRF = code[i].wantsFPRF; - bool opWantsCA = code[i].wantsCA; - code[i].wantsCR0 = wantsCR0 || code[i].canEndBlock; - code[i].wantsCR1 = wantsCR1 || code[i].canEndBlock; + bool opWantsCA = code[i].wantsCA; + code[i].wantsCR0 = wantsCR0 || code[i].canEndBlock; + code[i].wantsCR1 = wantsCR1 || code[i].canEndBlock; code[i].wantsFPRF = wantsFPRF || code[i].canEndBlock; - code[i].wantsCA = wantsCA || code[i].canEndBlock; - wantsCR0 |= opWantsCR0 || code[i].canEndBlock; - wantsCR1 |= opWantsCR1 || code[i].canEndBlock; + code[i].wantsCA = wantsCA || code[i].canEndBlock; + wantsCR0 |= 
opWantsCR0 || code[i].canEndBlock; + wantsCR1 |= opWantsCR1 || code[i].canEndBlock; wantsFPRF |= opWantsFPRF || code[i].canEndBlock; - wantsCA |= opWantsCA || code[i].canEndBlock; - wantsCR0 &= !code[i].outputCR0 || opWantsCR0; - wantsCR1 &= !code[i].outputCR1 || opWantsCR1; + wantsCA |= opWantsCA || code[i].canEndBlock; + wantsCR0 &= !code[i].outputCR0 || opWantsCR0; + wantsCR1 &= !code[i].outputCR1 || opWantsCR1; wantsFPRF &= !code[i].outputFPRF || opWantsFPRF; - wantsCA &= !code[i].outputCA || opWantsCA; - code[i].gprInUse = regInUse; - code[i].fprInUse = fregInUse; - code[i].fprInXmm = fregInXmm; + wantsCA &= !code[i].outputCA || opWantsCA; + code[i].gprInUse = gprInUse; + code[i].fprInUse = fprInUse; + code[i].gprInReg = gprInReg; + code[i].fprInXmm = fprInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. for (int j = 0; j < 3; j++) if (code[i].regsIn[j] >= 0) - regInUse |= 1 << code[i].regsIn[j]; + { + gprInUse |= 1 << code[i].regsIn[j]; + gprInReg |= 1 << code[i].regsIn[j]; + } for (int j = 0; j < 4; j++) if (code[i].fregsIn[j] >= 0) { - fregInUse |= 1 << code[i].fregsIn[j]; + fprInUse |= 1 << code[i].fregsIn[j]; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fregInXmm |= 1 << code[i].fregsIn[j]; + fprInXmm |= 1 << code[i].fregsIn[j]; } // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). 
for (int j = 0; j < 2; j++) if (code[i].regsOut[j] >= 0) - regInUse |= 1 << code[i].regsOut[j]; + gprInUse |= 1 << code[i].regsOut[j]; if (code[i].fregOut >= 0) { - fregInUse |= 1 << code[i].fregOut; + fprInUse |= 1 << code[i].fregOut; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fregInXmm |= 1 << code[i].fregOut; + fprInXmm |= 1 << code[i].fregOut; } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index a591c7f489..2c7ca62f34 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -43,8 +43,10 @@ struct CodeOp //16B bool canEndBlock; bool skip; // followed BL-s for example // which registers are still needed after this instruction in this block - u32 gprInUse; u32 fprInUse; + u32 gprInUse; + // just because a register is in use doesn't mean we actually need or want it in an x86 register. + u32 gprInReg; // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // an XMM only to move it again to a GPR afterwards. u32 fprInXmm;