From e883a6fb2e106bfecabaaa93c4f65d3ea021e899 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 12 Sep 2014 05:55:06 -0500 Subject: [PATCH] Fix all the current issues with the AArch64 register cache. Fixes all the current issues I've been experiencing. Scaled back the register cache idea for now so I can actually work on some real instructions. Tested this work with unit tests so I know it works. Unit tests are pretty great things. --- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 233 +++++++++++++----- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 69 ++++-- 2 files changed, 218 insertions(+), 84 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index a165828037..052d0ea9bf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -13,8 +13,12 @@ void Arm64RegCache::Init(ARM64XEmitter *emitter) GetAllocationOrder(); } -ARM64Reg Arm64RegCache::GetReg(void) +ARM64Reg Arm64RegCache::GetReg() { + // If we have no registers left, dump the most stale register first + if (!GetUnlockedRegisterCount()) + FlushMostStaleRegister(); + for (auto& it : m_host_registers) { if (!it.IsLocked()) @@ -29,6 +33,15 @@ ARM64Reg Arm64RegCache::GetReg(void) return INVALID_REG; } +u32 Arm64RegCache::GetUnlockedRegisterCount() +{ + u32 unlocked_registers = 0; + for (auto& it : m_host_registers) + if (!it.IsLocked()) + ++unlocked_registers; + return unlocked_registers; +} + void Arm64RegCache::LockRegister(ARM64Reg host_reg) { auto reg = std::find(m_host_registers.begin(), m_host_registers.end(), host_reg); @@ -101,6 +114,81 @@ bool Arm64GPRCache::IsCalleeSaved(ARM64Reg reg) return std::find(callee_regs.begin(), callee_regs.end(), EncodeRegTo64(reg)) != callee_regs.end(); } +void Arm64GPRCache::FlushRegister(u32 preg) +{ + u32 base_reg = preg; + OpArg& reg = m_guest_registers[preg]; + if (reg.GetType() == REG_REG) + { + 
ARM64Reg host_reg = reg.GetReg(); + + m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg])); + Unlock(host_reg); + + reg.Flush(); + } + else if (reg.GetType() == REG_IMM) + { + ARM64Reg host_reg = GetReg(); + + m_emit->MOVI2R(host_reg, reg.GetImm()); + m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg])); + + Unlock(host_reg); + + reg.Flush(); + } + else if (reg.GetType() == REG_AWAY) + { + u32 next_reg = 0; + if (reg.GetAwayLocation() == REG_LOW) + next_reg = base_reg + 1; + else + next_reg = base_reg - 1; + OpArg& reg2 = m_guest_registers[next_reg]; + ARM64Reg host_reg = reg.GetAwayReg(); + ARM64Reg host_reg_1 = reg.GetReg(); + ARM64Reg host_reg_2 = reg2.GetReg(); + // Flush if either of these shared registers are used. + if (host_reg_1 == INVALID_REG) + { + // We never loaded this register + // We've got to test the state of our shared register + // Currently it is always reg+1 + if (host_reg_2 == INVALID_REG) + { + // We didn't load either of these registers + // This can happen in cases where we had to flush register state + // or if we hit an interpreted instruction before we could use it + // Dump the whole thing in one go and flush both registers + + // 64bit host register will store 2 32bit store registers in one go + if (reg.GetAwayLocation() == REG_LOW) + m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[base_reg])); + else + m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[next_reg])); + } + else + { + // Alright, bottom register isn't used, but top one is + // Only store the top one + m_emit->STR(INDEX_UNSIGNED, host_reg_2, X29, PPCSTATE_OFF(gpr[next_reg])); + Unlock(host_reg_2); + } + } + else + { + m_emit->STR(INDEX_UNSIGNED, host_reg_1, X29, PPCSTATE_OFF(gpr[base_reg])); + Unlock(host_reg_1); + } + // Flush both registers + reg.Flush(); + reg2.Flush(); + Unlock(DecodeReg(host_reg)); + } + +} + void Arm64GPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) { for (int i = 0; i < 32; ++i) @@ -124,26 
+212,12 @@ void Arm64GPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) // Has to be flushed if it isn't in a callee saved register ARM64Reg host_reg = m_guest_registers[i].GetReg(); if (flush || !IsCalleeSaved(host_reg)) - { - m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[i])); - Unlock(host_reg); - - m_guest_registers[i].Flush(); - } + FlushRegister(i); } else if (m_guest_registers[i].GetType() == REG_IMM) { if (flush) - { - ARM64Reg host_reg = GetReg(); - - m_emit->MOVI2R(host_reg, m_guest_registers[i].GetImm()); - m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[i])); - - Unlock(host_reg); - - m_guest_registers[i].Flush(); - } + FlushRegister(i); } else if (m_guest_registers[i].GetType() == REG_AWAY) { @@ -173,39 +247,7 @@ void Arm64GPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) !IsCalleeSaved(host_reg_1) || !IsCalleeSaved(host_reg_2)) { - - if (host_reg_1 == INVALID_REG) - { - // We never loaded this register - // We've got to test the state of our shared register - // Currently it is always reg+1 - if (host_reg_2 == INVALID_REG) - { - // We didn't load either of these registers - // This can happen in cases where we had to flush register state - // or if we hit an interpreted instruction before we could use it - // Dump the whole thing in one go and flush both registers - - // 64bit host register will store 2 32bit store registers in one go - m_emit->STR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[i])); - } - else - { - // Alright, bottom register isn't used, but top one is - // Only store the top one - m_emit->STR(INDEX_UNSIGNED, host_reg_2, X29, PPCSTATE_OFF(gpr[i + 1])); - Unlock(host_reg_2); - } - } - else - { - m_emit->STR(INDEX_UNSIGNED, host_reg_1, X29, PPCSTATE_OFF(gpr[i])); - Unlock(host_reg_1); - } - // Flush both registers - m_guest_registers[i].Flush(); - m_guest_registers[i + 1].Flush(); - Unlock(DecodeReg(host_reg)); + FlushRegister(i); // Will flush both pairs of registers } // Skip the next register 
since we've handled it here ++i; @@ -216,6 +258,9 @@ void Arm64GPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) ARM64Reg Arm64GPRCache::R(u32 preg) { OpArg& reg = m_guest_registers[preg]; + IncrementAllUsed(); + reg.ResetLastUsed(); + switch (reg.GetType()) { case REG_REG: // already in a reg @@ -225,6 +270,8 @@ ARM64Reg Arm64GPRCache::R(u32 preg) { ARM64Reg host_reg = GetReg(); m_emit->MOVI2R(host_reg, reg.GetImm()); + reg.LoadToReg(host_reg); + return host_reg; } break; case REG_AWAY: // Register is away in a shared register @@ -232,22 +279,57 @@ ARM64Reg Arm64GPRCache::R(u32 preg) // Let's do the voodoo that we dodo if (reg.GetReg() == INVALID_REG) { - // Alright, we need to move to a valid location - ARM64Reg host_reg = GetReg(); - reg.LoadAwayToReg(host_reg); - // Alright, we need to extract from our away register // To our new 32bit register if (reg.GetAwayLocation() == REG_LOW) { - // We are in the low bits - // Just move it over to the low bits of the new register - m_emit->UBFM(EncodeRegTo64(host_reg), reg.GetAwayReg(), 0, 31); + OpArg& upper_reg = m_guest_registers[preg + 1]; + if (upper_reg.GetType() == REG_REG) + { + // If the upper reg is already moved away, just claim this one as ours now + ARM64Reg host_reg = reg.GetAwayReg(); + reg.LoadToReg(DecodeReg(host_reg)); + return host_reg; + } + else + { + // Top register is still loaded + // Make sure to move to a new register + ARM64Reg host_reg = GetReg(); + ARM64Reg current_reg = reg.GetAwayReg(); + reg.LoadToReg(host_reg); + + // We are in the low bits + // Just move it over to the low bits of the new register + m_emit->UBFM(EncodeRegTo64(host_reg), current_reg, 0, 31); + return host_reg; + } } else { - // We are in the high bits - m_emit->UBFM(EncodeRegTo64(host_reg), reg.GetAwayReg(), 32, 63); + OpArg& lower_reg = m_guest_registers[preg - 1]; + if (lower_reg.GetType() == REG_REG) + { + // If the lower register is moved away, claim this one as ours + ARM64Reg host_reg = reg.GetAwayReg(); + 
reg.LoadToReg(DecodeReg(host_reg)); + + // Make sure to move our register from the high bits to the low bits + m_emit->UBFM(EncodeRegTo64(host_reg), host_reg, 32, 63); + return host_reg; + } + else + { + // Load this register into the new low bits + // We are no longer away + ARM64Reg host_reg = GetReg(); + ARM64Reg current_reg = reg.GetAwayReg(); + reg.LoadToReg(host_reg); + + // We are in the high bits + m_emit->UBFM(EncodeRegTo64(host_reg), current_reg, 32, 63); + return host_reg; + } } } else @@ -259,10 +341,8 @@ ARM64Reg Arm64GPRCache::R(u32 preg) break; case REG_NOTLOADED: // Register isn't loaded at /all/ { - // This is kind of annoying, we shouldn't have gotten here - // This can happen with instructions that use multiple registers(eg lmw) - // The PPCAnalyst needs to be modified to handle these cases - _dbg_assert_msg_(DYNA_REC, false, "Hit REG_NOTLOADED type oparg. Fix the PPCAnalyst"); + // This is a bit annoying. We try to keep these preloaded as much as possible + // This can also happen in cases where PPCAnalyst isn't feeding us proper register usage statistics ARM64Reg host_reg = GetReg(); reg.LoadToReg(host_reg); m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg])); @@ -277,7 +357,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg) return INVALID_REG; } -void Arm64GPRCache::GetAllocationOrder(void) +void Arm64GPRCache::GetAllocationOrder() { // Callee saved registers first in hopes that we will keep everything stored there first const std::vector allocation_order = @@ -292,6 +372,24 @@ void Arm64GPRCache::GetAllocationOrder(void) m_host_registers.push_back(HostReg(reg)); } +void Arm64GPRCache::FlushMostStaleRegister() +{ + u32 most_stale_preg = 0; + u32 most_stale_amount = 0; + for (u32 i = 0; i < 32; ++i) + { + u32 last_used = m_guest_registers[i].GetLastUsed(); + if (last_used > most_stale_amount && + m_guest_registers[i].GetType() != REG_IMM && + m_guest_registers[i].GetType() != REG_NOTLOADED) + { + most_stale_preg = i; + most_stale_amount =
last_used; + } + } + FlushRegister(most_stale_preg); +} + // FPR Cache void Arm64FPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) { @@ -303,7 +401,7 @@ ARM64Reg Arm64FPRCache::R(u32 preg) // XXX: return a host reg holding a guest register } -void Arm64FPRCache::GetAllocationOrder(void) +void Arm64FPRCache::GetAllocationOrder() { const std::vector allocation_order = { @@ -317,3 +415,8 @@ void Arm64FPRCache::GetAllocationOrder(void) m_host_registers.push_back(HostReg(reg)); } +void Arm64FPRCache::FlushMostStaleRegister() +{ + // XXX: Flush a register +} + diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 58c21c2ae1..1eb1afd3d2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -42,10 +42,9 @@ class OpArg { public: OpArg() + : m_type(REG_NOTLOADED), m_reg(INVALID_REG), + m_value(0), m_last_used(0) { - m_type = REG_NOTLOADED; - m_reg = INVALID_REG; - m_value = 0; } RegType GetType() @@ -73,30 +72,40 @@ public: { m_type = REG_REG; m_reg = reg; + + m_away_reg = INVALID_REG; } void LoadToAway(ARM64Reg reg, RegLocation location) { m_type = REG_AWAY; - m_reg = INVALID_REG; m_away_reg = reg; m_away_location = location; - } - void LoadAwayToReg(ARM64Reg reg) - { - // We are still an away type - // We just are also in another register - m_reg = reg; + + m_reg = INVALID_REG; } void LoadToImm(u32 imm) { m_type = REG_IMM; m_value = imm; + + m_reg = INVALID_REG; + m_away_reg = INVALID_REG; } void Flush() { + // Invalidate any previous information m_type = REG_NOTLOADED; + m_reg = INVALID_REG; + m_away_reg = INVALID_REG; + + // Arbitrarily large value that won't roll over on a lot of increments + m_last_used = 0xFFFF; } + u32 GetLastUsed() { return m_last_used; } + void ResetLastUsed() { m_last_used = 0; } + void IncrementLastUsed() { ++m_last_used; } + private: // For REG_REG RegType m_type; // store type @@ -110,6 
+119,8 @@ private: // For REG_IMM u32 m_value; // IMM value + + u32 m_last_used; }; class HostReg @@ -117,10 +128,10 @@ class HostReg public: HostReg() : m_reg(INVALID_REG), m_locked(false) {} HostReg(ARM64Reg reg) : m_reg(reg), m_locked(false) {} - bool IsLocked(void) { return m_locked; } - void Lock(void) { m_locked = true; } - void Unlock(void) { m_locked = false; } - ARM64Reg GetReg(void) { return m_reg; } + bool IsLocked() { return m_locked; } + void Lock() { m_locked = true; } + void Unlock() { m_locked = false; } + ARM64Reg GetReg() { return m_reg; } bool operator==(const ARM64Reg& reg) { @@ -135,7 +146,7 @@ private: class Arm64RegCache { public: - Arm64RegCache(void) : m_emit(nullptr), m_reg_stats(nullptr) {}; + Arm64RegCache() : m_emit(nullptr), m_reg_stats(nullptr) {}; virtual ~Arm64RegCache() {}; void Init(ARM64XEmitter *emitter); @@ -151,7 +162,7 @@ public: // Returns a temporary register for use // Requires unlocking after done - ARM64Reg GetReg(void); + ARM64Reg GetReg(); // Locks a register so a cache cannot use it // Useful for function calls @@ -177,7 +188,10 @@ public: protected: // Get the order of the host registers - virtual void GetAllocationOrder(void) = 0; + virtual void GetAllocationOrder() = 0; + + // Flushes the most stale register + virtual void FlushMostStaleRegister() = 0; // Lock a register void LockRegister(ARM64Reg host_reg); @@ -185,6 +199,9 @@ protected: // Unlock a register void UnlockRegister(ARM64Reg host_reg); + // Get available host registers + u32 GetUnlockedRegisterCount(); + // Code emitter ARM64XEmitter *m_emit; @@ -220,7 +237,10 @@ public: protected: // Get the order of the host registers - void GetAllocationOrder(void); + void GetAllocationOrder(); + + // Flushes the most stale register + void FlushMostStaleRegister(); // Our guest GPRs // PowerPC has 32 GPRs @@ -228,6 +248,14 @@ protected: private: bool IsCalleeSaved(ARM64Reg reg); + + void IncrementAllUsed() + { + for (auto& reg : m_guest_registers) + 
reg.IncrementLastUsed(); + } + + void FlushRegister(u32 preg); }; class Arm64FPRCache : public Arm64RegCache @@ -243,7 +271,10 @@ public: protected: // Get the order of the host registers - void GetAllocationOrder(void); + void GetAllocationOrder(); + + // Flushes the most stale register + void FlushMostStaleRegister(); // Our guest FPRs // Gekko has 32 paired registers(32x2)