From 7ba9a8537bf15acae011aa01fbbebc3bcc634596 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 9 Oct 2014 16:11:10 -0700 Subject: [PATCH] JIT: add basic register allocation heuristics Should be at least a bit better than the previous LRU approach. Currently has two basic components: whether a register is dirty (dirty registers need to be stored, so clobbering them hurts more) and how many other registers will be used between now and the next time a register gets used. Also don't pre-load values that don't need to be in registers. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 3 +- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 130 +++++++++++++----- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 10 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 55 ++++---- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- 6 files changed, 134 insertions(+), 69 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 10ad1ee4c1..e9b7333ed2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -603,6 +603,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.compilerPC = ops[i].address; js.op = &ops[i]; js.instructionNumber = i; + js.instructionsLeft = (code_block.m_num_instructions - 1) - i; const GekkoOPInfo *opinfo = ops[i].opinfo; js.downcountAmount += opinfo->numCycles; @@ -737,7 +738,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) { int reg = ops[i].regsIn[k]; - if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm()) + if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm()) gpr.BindToRegister(reg, true, false); } for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp 
index 622d0b535d..a4e3c6c8b8 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -11,7 +11,7 @@ using namespace Gen; using namespace PowerPC; -RegCache::RegCache() : emit(nullptr), cur_use_quantum(0) +RegCache::RegCache() : emit(nullptr) { } @@ -29,7 +29,6 @@ void RegCache::Start() regs[i].location = GetDefaultLocation(i); regs[i].away = false; regs[i].locked = false; - regs[i].last_used_quantum = 0; } // todo: sort to find the most popular regs @@ -96,6 +95,82 @@ void RegCache::UnlockAllX() xreg.locked = false; } +u32 GPRRegCache::GetRegUtilization() +{ + return jit->js.op->gprInReg; +} + +u32 FPURegCache::GetRegUtilization() +{ + return jit->js.op->fprInXmm; +} + +u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) +{ + u32 regsUsed = 0; + for (u32 i = 1; i < lookahead; i++) + { + for (int j = 0; j < 3; j++) + if (jit->js.op[i].regsIn[j] >= 0) + regsUsed |= 1 << jit->js.op[i].regsIn[j]; + for (int j = 0; j < 3; j++) + if (jit->js.op[i].regsIn[j] == preg) + return regsUsed; + } + return regsUsed; +} + +u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) +{ + u32 regsUsed = 0; + for (u32 i = 1; i < lookahead; i++) + { + for (int j = 0; j < 4; j++) + if (jit->js.op[i].fregsIn[j] >= 0) + regsUsed |= 1 << jit->js.op[i].fregsIn[j]; + for (int j = 0; j < 4; j++) + if (jit->js.op[i].fregsIn[j] == preg) + return regsUsed; + } + return regsUsed; +} + +// Estimate roughly how bad it would be to de-allocate this register. Higher score +// means more bad. +float RegCache::ScoreRegister(X64Reg xr) +{ + size_t preg = xregs[xr].ppcReg; + float score = 0; + + // If it's not dirty, we don't need a store to write it back to the register file, so + // bias a bit against dirty registers. Testing shows that a bias of 2 seems roughly + // right: 3 causes too many extra clobbers, while 1 saves very few clobbers relative + // to the number of extra stores it causes. 
+ if (xregs[xr].dirty) + score += 2; + + // If the register isn't actually needed in a physical register for a later instruction, + // writing it back to the register file isn't quite as bad. + if (GetRegUtilization() & (1 << preg)) + { + u32 regsUsed = 0; + // Don't look too far ahead; we don't want to have quadratic compilation times for + // enormous block sizes! + // This actually improves register allocation a tiny bit; I'm not sure why. + u32 lookahead = std::min(jit->js.instructionsLeft, 64); + // Count how many other registers are going to be used before we need this one again. + u32 regs_in = CountRegsIn(preg, lookahead); + u32 regs_in_count = 0; + for (int i = 0; i < 32; i++) + regs_in_count += !!(regs_in & (1 << i)); + // Totally ad-hoc heuristic to bias based on how many other registers we'll need + // before this one gets used again. + score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count)); + } + + return score; +} + X64Reg RegCache::GetFreeXReg() { size_t aCount; @@ -108,45 +183,31 @@ X64Reg RegCache::GetFreeXReg() return (X64Reg)xr; } } - // Okay, not found :( Force grab one! - // First, see if we have any registers that are only going to be used for a float store. - // These go through GPRs, so the cost of tossing them back into memory is lower than anything else. + // Okay, not found; run the register allocator heuristic and figure out which register we should + // clobber. 
+ float min_score = std::numeric_limits<float>::max(); + X64Reg best_xreg = INVALID_REG; + size_t best_preg = 0; for (size_t i = 0; i < aCount; i++) { - X64Reg xr = (X64Reg)aOrder[i]; - if (xregs[xr].locked) + X64Reg xreg = (X64Reg)aOrder[i]; + size_t preg = xregs[xreg].ppcReg; + if (xregs[xreg].locked || regs[preg].locked) continue; - size_t preg = xregs[xr].ppcReg; - if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg))) + float score = ScoreRegister(xreg); + if (score < min_score) { - StoreFromRegister(preg); - return xr; + min_score = score; + best_xreg = xreg; + best_preg = preg; } } - //TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions - u32 last_used = 0xFFFFFFFF; - X64Reg last_used_xr = INVALID_REG; - size_t last_used_preg = 0; - for (size_t i = 0; i < aCount; i++) + if (best_xreg != INVALID_REG) { - X64Reg xr = (X64Reg)aOrder[i]; - if (xregs[xr].locked) - continue; - size_t preg = xregs[xr].ppcReg; - if (!regs[preg].locked && regs[preg].last_used_quantum < last_used) - { - last_used = regs[preg].last_used_quantum; - last_used_xr = xr; - last_used_preg = preg; - } - } - - if (last_used_xr != INVALID_REG) - { - StoreFromRegister(last_used_preg); - return last_used_xr; + StoreFromRegister(best_preg); + return best_xreg; } //Still no dice? Die! 
@@ -197,7 +258,6 @@ void RegCache::DiscardRegContentsIfCached(size_t preg) xregs[xr].ppcReg = INVALID_REG; regs[preg].away = false; regs[preg].location = GetDefaultLocation(preg); - regs[preg].last_used_quantum = 0; } } @@ -279,7 +339,6 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) } regs[i].away = true; regs[i].location = ::Gen::R(xr); - regs[i].last_used_quantum = ++cur_use_quantum; } else { @@ -322,7 +381,6 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode) { regs[i].location = newLoc; regs[i].away = false; - regs[i].last_used_quantum = 0; } } } @@ -378,8 +436,6 @@ void RegCache::Flush(FlushMode mode) } } } - - cur_use_quantum = 0; } int RegCache::NumFreeRegisters() diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 0fe3e9fe5f..59b1a5abe6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -20,7 +20,6 @@ struct PPCCachedReg Gen::OpArg location; bool away; // value not in source register bool locked; - u32 last_used_quantum; }; struct X64CachedReg @@ -44,9 +43,12 @@ protected: virtual const int *GetAllocationOrder(size_t& count) = 0; + virtual u32 GetRegUtilization() = 0; + virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0; + Gen::XEmitter *emit; - u32 cur_use_quantum; + float ScoreRegister(Gen::X64Reg xreg); public: RegCache(); @@ -134,6 +136,8 @@ public: Gen::OpArg GetDefaultLocation(size_t reg) const override; const int* GetAllocationOrder(size_t& count) override; void SetImmediate32(size_t preg, u32 immValue); + u32 GetRegUtilization(); + u32 CountRegsIn(size_t preg, u32 lookahead); }; @@ -144,4 +148,6 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; const int* GetAllocationOrder(size_t& count) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; + u32 GetRegUtilization(); + u32 CountRegsIn(size_t preg, u32 lookahead); }; diff --git 
a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 4c3cd2562b..22c4ac9d2b 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -74,6 +74,7 @@ protected: u32 blockStart; UGeckoInstruction next_inst; // for easy peephole opt. int instructionNumber; + int instructionsLeft; int downcountAmount; u32 numLoadStoreInst; u32 numFloatingPointInst; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e0c76192b6..4bd5b60a14 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -796,57 +796,56 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. - bool wantsCR0 = true; - bool wantsCR1 = true; - bool wantsFPRF = true; - bool wantsCA = true; - u32 fregInUse = 0; - u32 regInUse = 0; - u32 fregInXmm = 0; + bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true; + u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0; for (int i = block->m_num_instructions - 1; i >= 0; i--) { - bool opWantsCR0 = code[i].wantsCR0; - bool opWantsCR1 = code[i].wantsCR1; + bool opWantsCR0 = code[i].wantsCR0; + bool opWantsCR1 = code[i].wantsCR1; bool opWantsFPRF = code[i].wantsFPRF; - bool opWantsCA = code[i].wantsCA; - code[i].wantsCR0 = wantsCR0 || code[i].canEndBlock; - code[i].wantsCR1 = wantsCR1 || code[i].canEndBlock; + bool opWantsCA = code[i].wantsCA; + code[i].wantsCR0 = wantsCR0 || code[i].canEndBlock; + code[i].wantsCR1 = wantsCR1 || code[i].canEndBlock; code[i].wantsFPRF = wantsFPRF || code[i].canEndBlock; - code[i].wantsCA = wantsCA || code[i].canEndBlock; - wantsCR0 |= opWantsCR0 || code[i].canEndBlock; - wantsCR1 |= opWantsCR1 || code[i].canEndBlock; + code[i].wantsCA = wantsCA || code[i].canEndBlock; + wantsCR0 |= 
opWantsCR0 || code[i].canEndBlock; + wantsCR1 |= opWantsCR1 || code[i].canEndBlock; wantsFPRF |= opWantsFPRF || code[i].canEndBlock; - wantsCA |= opWantsCA || code[i].canEndBlock; - wantsCR0 &= !code[i].outputCR0 || opWantsCR0; - wantsCR1 &= !code[i].outputCR1 || opWantsCR1; + wantsCA |= opWantsCA || code[i].canEndBlock; + wantsCR0 &= !code[i].outputCR0 || opWantsCR0; + wantsCR1 &= !code[i].outputCR1 || opWantsCR1; wantsFPRF &= !code[i].outputFPRF || opWantsFPRF; - wantsCA &= !code[i].outputCA || opWantsCA; - code[i].gprInUse = regInUse; - code[i].fprInUse = fregInUse; - code[i].fprInXmm = fregInXmm; + wantsCA &= !code[i].outputCA || opWantsCA; + code[i].gprInUse = gprInUse; + code[i].fprInUse = fprInUse; + code[i].gprInReg = gprInReg; + code[i].fprInXmm = fprInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. for (int j = 0; j < 3; j++) if (code[i].regsIn[j] >= 0) - regInUse |= 1 << code[i].regsIn[j]; + { + gprInUse |= 1 << code[i].regsIn[j]; + gprInReg |= 1 << code[i].regsIn[j]; + } for (int j = 0; j < 4; j++) if (code[i].fregsIn[j] >= 0) { - fregInUse |= 1 << code[i].fregsIn[j]; + fprInUse |= 1 << code[i].fregsIn[j]; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fregInXmm |= 1 << code[i].fregsIn[j]; + fprInXmm |= 1 << code[i].fregsIn[j]; } // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). 
for (int j = 0; j < 2; j++) if (code[i].regsOut[j] >= 0) - regInUse |= 1 << code[i].regsOut[j]; + gprInUse |= 1 << code[i].regsOut[j]; if (code[i].fregOut >= 0) { - fregInUse |= 1 << code[i].fregOut; + fprInUse |= 1 << code[i].fregOut; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fregInXmm |= 1 << code[i].fregOut; + fprInXmm |= 1 << code[i].fregOut; } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index a591c7f489..2c7ca62f34 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -43,8 +43,10 @@ struct CodeOp //16B bool canEndBlock; bool skip; // followed BL-s for example // which registers are still needed after this instruction in this block - u32 gprInUse; u32 fprInUse; + u32 gprInUse; + // just because a register is in use doesn't mean we actually need or want it in an x86 register. + u32 gprInReg; // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // an XMM only to move it again to a GPR afterwards. u32 fprInXmm;