Merge pull request #1252 from FioraAeterna/regallocator
JIT: add basic register allocation heuristics
This commit is contained in:
commit
e50dad67ce
|
@ -603,6 +603,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
||||||
js.compilerPC = ops[i].address;
|
js.compilerPC = ops[i].address;
|
||||||
js.op = &ops[i];
|
js.op = &ops[i];
|
||||||
js.instructionNumber = i;
|
js.instructionNumber = i;
|
||||||
|
js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
|
||||||
const GekkoOPInfo *opinfo = ops[i].opinfo;
|
const GekkoOPInfo *opinfo = ops[i].opinfo;
|
||||||
js.downcountAmount += opinfo->numCycles;
|
js.downcountAmount += opinfo->numCycles;
|
||||||
|
|
||||||
|
@ -737,7 +738,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
||||||
for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
|
for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
|
||||||
{
|
{
|
||||||
int reg = ops[i].regsIn[k];
|
int reg = ops[i].regsIn[k];
|
||||||
if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm())
|
if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm())
|
||||||
gpr.BindToRegister(reg, true, false);
|
gpr.BindToRegister(reg, true, false);
|
||||||
}
|
}
|
||||||
for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
|
for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
using namespace PowerPC;
|
using namespace PowerPC;
|
||||||
|
|
||||||
RegCache::RegCache() : emit(nullptr), cur_use_quantum(0)
|
RegCache::RegCache() : emit(nullptr)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,7 +29,6 @@ void RegCache::Start()
|
||||||
regs[i].location = GetDefaultLocation(i);
|
regs[i].location = GetDefaultLocation(i);
|
||||||
regs[i].away = false;
|
regs[i].away = false;
|
||||||
regs[i].locked = false;
|
regs[i].locked = false;
|
||||||
regs[i].last_used_quantum = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo: sort to find the most popular regs
|
// todo: sort to find the most popular regs
|
||||||
|
@ -96,6 +95,82 @@ void RegCache::UnlockAllX()
|
||||||
xreg.locked = false;
|
xreg.locked = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the gprInReg bitmask recorded on the instruction currently being compiled
// (jit->js.op); one bit per PowerPC GPR.
u32 GPRRegCache::GetRegUtilization()
{
	const u32 utilization = jit->js.op->gprInReg;
	return utilization;
}
|
||||||
|
|
||||||
|
// Returns the fprInXmm bitmask recorded on the instruction currently being compiled
// (jit->js.op); one bit per PowerPC FPR.
//
// This has to be the floating-point mask: ScoreRegister() tests the returned value
// against the index of an FPR held by this cache, so handing back the GPR mask
// (gprInReg) would score floating-point registers by unrelated integer register usage.
u32 FPURegCache::GetRegUtilization()
{
	return jit->js.op->fprInXmm;
}
|
||||||
|
|
||||||
|
// Builds a bitmask of the GPRs read as inputs by upcoming instructions, scanning
// js.op[1] up to (but not including) js.op[lookahead]. The scan stops after the first
// instruction that reads preg; that instruction's inputs are still part of the mask.
u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
{
	u32 inputMask = 0;
	for (u32 offset = 1; offset < lookahead; ++offset)
	{
		const auto& upcoming = jit->js.op[offset];

		// First fold every valid input register of this instruction into the mask...
		for (int slot = 0; slot < 3; ++slot)
		{
			if (upcoming.regsIn[slot] >= 0)
				inputMask |= 1 << upcoming.regsIn[slot];
		}

		// ...and only then check whether this instruction is the next reader of preg.
		bool readsPreg = false;
		for (int slot = 0; slot < 3; ++slot)
			readsPreg = readsPreg || (upcoming.regsIn[slot] == preg);

		if (readsPreg)
			break;
	}
	return inputMask;
}
|
||||||
|
|
||||||
|
// Builds a bitmask of the FPRs read as inputs by upcoming instructions, scanning
// js.op[1] up to (but not including) js.op[lookahead]. The scan stops after the first
// instruction that reads preg; that instruction's inputs are still part of the mask.
u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
{
	u32 inputMask = 0;
	for (u32 offset = 1; offset < lookahead; ++offset)
	{
		const auto& upcoming = jit->js.op[offset];

		// First fold every valid input register of this instruction into the mask...
		for (int slot = 0; slot < 4; ++slot)
		{
			if (upcoming.fregsIn[slot] >= 0)
				inputMask |= 1 << upcoming.fregsIn[slot];
		}

		// ...and only then check whether this instruction is the next reader of preg.
		bool readsPreg = false;
		for (int slot = 0; slot < 4; ++slot)
			readsPreg = readsPreg || (upcoming.fregsIn[slot] == preg);

		if (readsPreg)
			break;
	}
	return inputMask;
}
|
||||||
|
|
||||||
|
// Estimate roughly how bad it would be to de-allocate this register. Higher score
|
||||||
|
// means more bad.
|
||||||
|
float RegCache::ScoreRegister(X64Reg xr)
|
||||||
|
{
|
||||||
|
size_t preg = xregs[xr].ppcReg;
|
||||||
|
float score = 0;
|
||||||
|
|
||||||
|
// If it's not dirty, we don't need a store to write it back to the register file, so
|
||||||
|
// bias a bit against dirty registers. Testing shows that a bias of 2 seems roughly
|
||||||
|
// right: 3 causes too many extra clobbers, while 1 saves very few clobbers relative
|
||||||
|
// to the number of extra stores it causes.
|
||||||
|
if (xregs[xr].dirty)
|
||||||
|
score += 2;
|
||||||
|
|
||||||
|
// If the register isn't actually needed in a physical register for a later instruction,
|
||||||
|
// writing it back to the register file isn't quite as bad.
|
||||||
|
if (GetRegUtilization() & (1 << preg))
|
||||||
|
{
|
||||||
|
u32 regsUsed = 0;
|
||||||
|
// Don't look too far ahead; we don't want to have quadratic compilation times for
|
||||||
|
// enormous block sizes!
|
||||||
|
// This actually improves register allocation a tiny bit; I'm not sure why.
|
||||||
|
u32 lookahead = std::min(jit->js.instructionsLeft, 64);
|
||||||
|
// Count how many other registers are going to be used before we need this one again.
|
||||||
|
u32 regs_in = CountRegsIn(preg, lookahead);
|
||||||
|
u32 regs_in_count = 0;
|
||||||
|
for (int i = 0; i < 32; i++)
|
||||||
|
regs_in_count += !!(regs_in & (1 << i));
|
||||||
|
// Totally ad-hoc heuristic to bias based on how many other registers we'll need
|
||||||
|
// before this one gets used again.
|
||||||
|
score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count));
|
||||||
|
}
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
X64Reg RegCache::GetFreeXReg()
|
X64Reg RegCache::GetFreeXReg()
|
||||||
{
|
{
|
||||||
size_t aCount;
|
size_t aCount;
|
||||||
|
@ -108,45 +183,31 @@ X64Reg RegCache::GetFreeXReg()
|
||||||
return (X64Reg)xr;
|
return (X64Reg)xr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Okay, not found :( Force grab one!
|
|
||||||
|
|
||||||
// First, see if we have any registers that are only going to be used for a float store.
|
// Okay, not found; run the register allocator heuristic and figure out which register we should
|
||||||
// These go through GPRs, so the cost of tossing them back into memory is lower than anything else.
|
// clobber.
|
||||||
|
float min_score = std::numeric_limits<float>::max();
|
||||||
|
X64Reg best_xreg = INVALID_REG;
|
||||||
|
size_t best_preg = 0;
|
||||||
for (size_t i = 0; i < aCount; i++)
|
for (size_t i = 0; i < aCount; i++)
|
||||||
{
|
{
|
||||||
X64Reg xr = (X64Reg)aOrder[i];
|
X64Reg xreg = (X64Reg)aOrder[i];
|
||||||
if (xregs[xr].locked)
|
size_t preg = xregs[xreg].ppcReg;
|
||||||
|
if (xregs[xreg].locked || regs[preg].locked)
|
||||||
continue;
|
continue;
|
||||||
size_t preg = xregs[xr].ppcReg;
|
float score = ScoreRegister(xreg);
|
||||||
if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg)))
|
if (score < min_score)
|
||||||
{
|
{
|
||||||
StoreFromRegister(preg);
|
min_score = score;
|
||||||
return xr;
|
best_xreg = xreg;
|
||||||
|
best_preg = preg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions
|
if (best_xreg != INVALID_REG)
|
||||||
u32 last_used = 0xFFFFFFFF;
|
|
||||||
X64Reg last_used_xr = INVALID_REG;
|
|
||||||
size_t last_used_preg = 0;
|
|
||||||
for (size_t i = 0; i < aCount; i++)
|
|
||||||
{
|
{
|
||||||
X64Reg xr = (X64Reg)aOrder[i];
|
StoreFromRegister(best_preg);
|
||||||
if (xregs[xr].locked)
|
return best_xreg;
|
||||||
continue;
|
|
||||||
size_t preg = xregs[xr].ppcReg;
|
|
||||||
if (!regs[preg].locked && regs[preg].last_used_quantum < last_used)
|
|
||||||
{
|
|
||||||
last_used = regs[preg].last_used_quantum;
|
|
||||||
last_used_xr = xr;
|
|
||||||
last_used_preg = preg;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (last_used_xr != INVALID_REG)
|
|
||||||
{
|
|
||||||
StoreFromRegister(last_used_preg);
|
|
||||||
return last_used_xr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Still no dice? Die!
|
//Still no dice? Die!
|
||||||
|
@ -197,7 +258,6 @@ void RegCache::DiscardRegContentsIfCached(size_t preg)
|
||||||
xregs[xr].ppcReg = INVALID_REG;
|
xregs[xr].ppcReg = INVALID_REG;
|
||||||
regs[preg].away = false;
|
regs[preg].away = false;
|
||||||
regs[preg].location = GetDefaultLocation(preg);
|
regs[preg].location = GetDefaultLocation(preg);
|
||||||
regs[preg].last_used_quantum = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -279,7 +339,6 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
|
||||||
}
|
}
|
||||||
regs[i].away = true;
|
regs[i].away = true;
|
||||||
regs[i].location = ::Gen::R(xr);
|
regs[i].location = ::Gen::R(xr);
|
||||||
regs[i].last_used_quantum = ++cur_use_quantum;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -322,7 +381,6 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode)
|
||||||
{
|
{
|
||||||
regs[i].location = newLoc;
|
regs[i].location = newLoc;
|
||||||
regs[i].away = false;
|
regs[i].away = false;
|
||||||
regs[i].last_used_quantum = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -378,8 +436,6 @@ void RegCache::Flush(FlushMode mode)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cur_use_quantum = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int RegCache::NumFreeRegisters()
|
int RegCache::NumFreeRegisters()
|
||||||
|
|
|
@ -20,7 +20,6 @@ struct PPCCachedReg
|
||||||
Gen::OpArg location;
|
Gen::OpArg location;
|
||||||
bool away; // value not in source register
|
bool away; // value not in source register
|
||||||
bool locked;
|
bool locked;
|
||||||
u32 last_used_quantum;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct X64CachedReg
|
struct X64CachedReg
|
||||||
|
@ -44,9 +43,12 @@ protected:
|
||||||
|
|
||||||
virtual const int *GetAllocationOrder(size_t& count) = 0;
|
virtual const int *GetAllocationOrder(size_t& count) = 0;
|
||||||
|
|
||||||
|
virtual u32 GetRegUtilization() = 0;
|
||||||
|
virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0;
|
||||||
|
|
||||||
Gen::XEmitter *emit;
|
Gen::XEmitter *emit;
|
||||||
|
|
||||||
u32 cur_use_quantum;
|
float ScoreRegister(Gen::X64Reg xreg);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
RegCache();
|
RegCache();
|
||||||
|
@ -134,6 +136,8 @@ public:
|
||||||
Gen::OpArg GetDefaultLocation(size_t reg) const override;
|
Gen::OpArg GetDefaultLocation(size_t reg) const override;
|
||||||
const int* GetAllocationOrder(size_t& count) override;
|
const int* GetAllocationOrder(size_t& count) override;
|
||||||
void SetImmediate32(size_t preg, u32 immValue);
|
void SetImmediate32(size_t preg, u32 immValue);
|
||||||
|
u32 GetRegUtilization();
|
||||||
|
u32 CountRegsIn(size_t preg, u32 lookahead);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -144,4 +148,6 @@ public:
|
||||||
void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
|
void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
|
||||||
const int* GetAllocationOrder(size_t& count) override;
|
const int* GetAllocationOrder(size_t& count) override;
|
||||||
Gen::OpArg GetDefaultLocation(size_t reg) const override;
|
Gen::OpArg GetDefaultLocation(size_t reg) const override;
|
||||||
|
u32 GetRegUtilization();
|
||||||
|
u32 CountRegsIn(size_t preg, u32 lookahead);
|
||||||
};
|
};
|
||||||
|
|
|
@ -74,6 +74,7 @@ protected:
|
||||||
u32 blockStart;
|
u32 blockStart;
|
||||||
UGeckoInstruction next_inst; // for easy peephole opt.
|
UGeckoInstruction next_inst; // for easy peephole opt.
|
||||||
int instructionNumber;
|
int instructionNumber;
|
||||||
|
int instructionsLeft;
|
||||||
int downcountAmount;
|
int downcountAmount;
|
||||||
u32 numLoadStoreInst;
|
u32 numLoadStoreInst;
|
||||||
u32 numFloatingPointInst;
|
u32 numFloatingPointInst;
|
||||||
|
|
|
@ -796,13 +796,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||||
|
|
||||||
// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
|
// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
|
||||||
// wants flags, to be safe.
|
// wants flags, to be safe.
|
||||||
bool wantsCR0 = true;
|
bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true;
|
||||||
bool wantsCR1 = true;
|
u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0;
|
||||||
bool wantsFPRF = true;
|
|
||||||
bool wantsCA = true;
|
|
||||||
u32 fregInUse = 0;
|
|
||||||
u32 regInUse = 0;
|
|
||||||
u32 fregInXmm = 0;
|
|
||||||
for (int i = block->m_num_instructions - 1; i >= 0; i--)
|
for (int i = block->m_num_instructions - 1; i >= 0; i--)
|
||||||
{
|
{
|
||||||
bool opWantsCR0 = code[i].wantsCR0;
|
bool opWantsCR0 = code[i].wantsCR0;
|
||||||
|
@ -821,32 +816,36 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||||
wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
|
wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
|
||||||
wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
|
wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
|
||||||
wantsCA &= !code[i].outputCA || opWantsCA;
|
wantsCA &= !code[i].outputCA || opWantsCA;
|
||||||
code[i].gprInUse = regInUse;
|
code[i].gprInUse = gprInUse;
|
||||||
code[i].fprInUse = fregInUse;
|
code[i].fprInUse = fprInUse;
|
||||||
code[i].fprInXmm = fregInXmm;
|
code[i].gprInReg = gprInReg;
|
||||||
|
code[i].fprInXmm = fprInXmm;
|
||||||
// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
|
// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
|
||||||
// we can throw away a register if it's going to be overwritten later.
|
// we can throw away a register if it's going to be overwritten later.
|
||||||
for (int j = 0; j < 3; j++)
|
for (int j = 0; j < 3; j++)
|
||||||
if (code[i].regsIn[j] >= 0)
|
if (code[i].regsIn[j] >= 0)
|
||||||
regInUse |= 1 << code[i].regsIn[j];
|
{
|
||||||
|
gprInUse |= 1 << code[i].regsIn[j];
|
||||||
|
gprInReg |= 1 << code[i].regsIn[j];
|
||||||
|
}
|
||||||
for (int j = 0; j < 4; j++)
|
for (int j = 0; j < 4; j++)
|
||||||
if (code[i].fregsIn[j] >= 0)
|
if (code[i].fregsIn[j] >= 0)
|
||||||
{
|
{
|
||||||
fregInUse |= 1 << code[i].fregsIn[j];
|
fprInUse |= 1 << code[i].fregsIn[j];
|
||||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
||||||
fregInXmm |= 1 << code[i].fregsIn[j];
|
fprInXmm |= 1 << code[i].fregsIn[j];
|
||||||
}
|
}
|
||||||
// For now, we need to count output registers as "used" though; otherwise the flush
|
// For now, we need to count output registers as "used" though; otherwise the flush
|
||||||
// will result in a redundant store (e.g. store to regcache, then store again to
|
// will result in a redundant store (e.g. store to regcache, then store again to
|
||||||
// the same location later).
|
// the same location later).
|
||||||
for (int j = 0; j < 2; j++)
|
for (int j = 0; j < 2; j++)
|
||||||
if (code[i].regsOut[j] >= 0)
|
if (code[i].regsOut[j] >= 0)
|
||||||
regInUse |= 1 << code[i].regsOut[j];
|
gprInUse |= 1 << code[i].regsOut[j];
|
||||||
if (code[i].fregOut >= 0)
|
if (code[i].fregOut >= 0)
|
||||||
{
|
{
|
||||||
fregInUse |= 1 << code[i].fregOut;
|
fprInUse |= 1 << code[i].fregOut;
|
||||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
||||||
fregInXmm |= 1 << code[i].fregOut;
|
fprInXmm |= 1 << code[i].fregOut;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return address;
|
return address;
|
||||||
|
|
|
@ -43,8 +43,10 @@ struct CodeOp //16B
|
||||||
bool canEndBlock;
|
bool canEndBlock;
|
||||||
bool skip; // followed BL-s for example
|
bool skip; // followed BL-s for example
|
||||||
// which registers are still needed after this instruction in this block
|
// which registers are still needed after this instruction in this block
|
||||||
u32 gprInUse;
|
|
||||||
u32 fprInUse;
|
u32 fprInUse;
|
||||||
|
u32 gprInUse;
|
||||||
|
// just because a register is in use doesn't mean we actually need or want it in an x86 register.
|
||||||
|
u32 gprInReg;
|
||||||
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
||||||
// an XMM only to move it again to a GPR afterwards.
|
// an XMM only to move it again to a GPR afterwards.
|
||||||
u32 fprInXmm;
|
u32 fprInXmm;
|
||||||
|
|
Loading…
Reference in New Issue