From 8fe730194b4a4a8786ad0bea842e5c880bf4d50f Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 25 Sep 2014 16:01:29 -0700 Subject: [PATCH] JIT: load registers if they're going to be used later in the block --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 20 +++++++++++++ .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 28 ++++++++++++++++++- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 10 +++++++ Source/Core/Core/PowerPC/PPCAnalyst.h | 3 ++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 5877073377..c99f707f88 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -727,6 +727,26 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc SetJumpTarget(noBreakpoint); } + // If we have an input register that is going to be used again, load it pre-emptively, + // even if the instruction doesn't strictly need it in a register, to avoid redundant + // loads later. Of course, don't do this if we're already out of registers. + // As a bit of a heuristic, make sure we have at least one register left over for the + // output, which needs to be bound in the actual instruction compilation. + // TODO: make this smarter in the case that we're actually register-starved, i.e. + // prioritize the more important registers. + for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) + { + int reg = ops[i].regsIn[k]; + if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm()) + gpr.BindToRegister(reg, true, false); + } + for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) + { + int reg = ops[i].fregsIn[k]; + if (reg >= 0 && (ops[i].fprInXmm & (1 << reg))) + fpr.BindToRegister(reg, true, false); + } + Jit64Tables::CompileInstruction(ops[i]); // If we have a register that will never be used again, flush it. diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 11eb9de2c7..622d0b535d 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -108,7 +108,22 @@ X64Reg RegCache::GetFreeXReg() return (X64Reg)xr; } } - //Okay, not found :( Force grab one + // Okay, not found :( Force grab one! + + // First, see if we have any registers that are only going to be used for a float store. + // These go through GPRs, so the cost of tossing them back into memory is lower than anything else. + for (size_t i = 0; i < aCount; i++) + { + X64Reg xr = (X64Reg)aOrder[i]; + if (xregs[xr].locked) + continue; + size_t preg = xregs[xr].ppcReg; + if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg))) + { + StoreFromRegister(preg); + return xr; + } + } //TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions u32 last_used = 0xFFFFFFFF; @@ -366,3 +381,14 @@ void RegCache::Flush(FlushMode mode) cur_use_quantum = 0; } + +int RegCache::NumFreeRegisters() +{ + int count = 0; + size_t aCount; + const int* aOrder = GetAllocationOrder(aCount); + for (size_t i = 0; i < aCount; i++) + if (!xregs[aOrder[i]].locked && xregs[aOrder[i]].free) + count++; + return count; +} diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 7a79086f54..0fe3e9fe5f 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -123,6 +123,7 @@ public: Gen::X64Reg GetFreeXReg(); + int NumFreeRegisters(); }; class GPRRegCache : public RegCache diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 923e621dcd..e0c76192b6 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -802,6 +802,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 bool wantsCA = true; u32 fregInUse = 0; u32 regInUse = 0; + u32 fregInXmm = 0; for (int i = block->m_num_instructions - 1; i >= 0; i--) { bool opWantsCR0 = code[i].wantsCR0; @@ -822,6 +823,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 wantsCA &= !code[i].outputCA || opWantsCA; code[i].gprInUse = regInUse; code[i].fprInUse = fregInUse; + code[i].fprInXmm = fregInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. for (int j = 0; j < 3; j++) @@ -829,7 +831,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 regInUse |= 1 << code[i].regsIn[j]; for (int j = 0; j < 4; j++) if (code[i].fregsIn[j] >= 0) + { fregInUse |= 1 << code[i].fregsIn[j]; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fregInXmm |= 1 << code[i].fregsIn[j]; + } // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). @@ -837,7 +843,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 if (code[i].regsOut[j] >= 0) regInUse |= 1 << code[i].regsOut[j]; if (code[i].fregOut >= 0) + { fregInUse |= 1 << code[i].fregOut; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fregInXmm |= 1 << code[i].fregOut; + } } return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 4936f854c5..a591c7f489 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -45,6 +45,9 @@ struct CodeOp //16B // which registers are still needed after this instruction in this block u32 gprInUse; u32 fprInUse; + // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into + // an XMM only to move it again to a GPR afterwards. + u32 fprInXmm; }; struct BlockStats