From 0367e7ee4d741f063451784ed322224b6f9fe114 Mon Sep 17 00:00:00 2001
From: magumagu9
Date: Thu, 1 Jan 2009 12:50:23 +0000
Subject: [PATCH] And a bit more JIT WIP work: improved code generation for integer load/store, and outlining the start of FP support.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1729 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp     | 266 ++++++++++++++----
 Source/Core/Core/Src/PowerPC/Jit64IL/IR.h       |  64 +++++
 .../Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp     |   6 +-
 3 files changed, 272 insertions(+), 64 deletions(-)

diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
index 9271532cb5..3545435881 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@@ -78,14 +78,30 @@
 on the test I've been working on (which bounded by JIT performance and
 doesn't use any floating-point), it's roughly 25% faster than the current
 JIT, with the edge over the current JIT mostly due to the fast memory
 optimization.
+Update on perf:
+I've been doing a bit more tweaking for a small perf improvement (in the
+range of 5-10%). That said, it's getting to the point where I'm simply
+not seeing potential for improvements to codegen, at least for long,
+straightforward blocks. For one long block that's at the top of my samples,
+I've managed to get the bloat% (number of instructions compared to PPC
+equivalent) down to 225%, and I can't really see it going down much further.
+It looks like the most promising paths to further improvement for pure
+integer code are more aggressively combining blocks and dead condition
+register elimination, which should be very helpful for small blocks.
+
 TODO (in no particular order):
-Floating-point JIT (both paired and unpaired): currently falls back
- to the interpreter
+Floating-point JIT (both paired and unpaired)
+ (very large win for FP code, no effect for integer code)
+Inter-block dead condition register elimination (Likely significant win
+ combined with optimized conditions)
 Optimize conditions for conditional branches.
-Inter-block dead register elimination, especially for CR0.
+General dead register elimination.
 Inter-block inlining.
-Track down a few correctness bugs.
-Implement a select instruction
+Track down a few correctness bugs (I think there's something wrong
+ with my branches, but I haven't been able to figure it out).
+Specialized slw/srw/sraw; I think there are some tricks that could
+ have a non-trivial effect, and there are significantly shorter
+ implementations for 64-bit involving abusing 64-bit shifts.
 64-bit compat (it should only be a few tweaks to register allocation
 and the load/store code)
 Scheduling to reduce register pressure: PowerPC compilers like to push
@@ -93,8 +109,16 @@
 x86 processors, which are short on registers and extremely good at
 instruction reordering.
 Common subexpression elimination
-Optimize load of sum using complex addressing (partially implemented)
+Optimize load/store of sum using complex addressing (partially implemented)
 Implement idle-skipping
+Loop optimizations (loop-carried registers, LICM); not sure how much
+ this will help on top of dead register elimination
+Fold loads (both register and memory) into arithmetic operations
+Code refactoring/cleanup
+Investigate performance of the JIT itself; this doesn't affect
+ framerates significantly, but it does take a visible amount
+ of time for a complicated piece of code like a video decoder
+ to compile.
 
 */
@@ -492,6 +516,9 @@ struct RegInfo {
 		exitNumber = 0;
 		MakeProfile = UseProfile = false;
 	}
+
+	private:
+	RegInfo(RegInfo&); // DO NOT IMPLEMENT
 };
 
 static void regMarkUse(RegInfo& R, InstLoc I, InstLoc Op, unsigned OpNum) {
@@ -635,48 +662,119 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
 	RI.regs[reg] = I;
 }
 
-static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
-	X64Reg reg;
-	unsigned offset;
-
-	if (getOpcode(*getOp1(I)) == Add && isImm(*getOp2(getOp1(I)))) {
-		offset = RI.Build->GetImmValue(getOp2(getOp1(I)));
-		reg = regBinLHSReg(RI, getOp1(I));
-		if (RI.IInfo[I - RI.FirstI] & 4)
-			regClearInst(RI, getOp1(getOp1(I)));
-	} else {
-		offset = 0;
-		reg = regBinLHSReg(RI, I);
-		if (RI.IInfo[I - RI.FirstI] & 4)
-			regClearInst(RI, getOp1(I));
+// Mark and calculation routines for profiled load/store addresses
+// Could be extended to unprofiled addresses.
+// FIXME: Finish/activate!
+static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
+	if (isImm(*AI)) {
+		unsigned addr = RI.Build->GetImmValue(AI);
+		if (Memory::IsRAMAddress(addr))
+			return;
 	}
-	if (RI.UseProfile) {
-		unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++];
-		if (!(curLoad & 0x0C000000)) {
-			if (regReadUse(RI, I)) {
-				unsigned addr = (u32)Memory::base - (curLoad & 0xC0000000) + offset;
-				RI.Jit->MOVZX(32, Size, reg, MDisp(reg, addr));
-				RI.Jit->BSWAP(Size, reg);
-				RI.regs[reg] = I;
-			}
+	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
+		regMarkUse(RI, I, getOp1(AI), OpNum);
+		return;
+	}
+	regMarkUse(RI, I, AI, OpNum);
+}
+
+static void regClearDeadMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
+	if (!(RI.IInfo[I - RI.FirstI] & (2 << OpNum)))
+		return;
+	if (isImm(*AI)) {
+		unsigned addr = RI.Build->GetImmValue(AI);
+		if (Memory::IsRAMAddress(addr)) {
 			return;
 		}
 	}
-	if (offset) {
-		RI.Jit->ADD(32, R(reg), Imm32(offset));
+	InstLoc AddrBase;
+	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
+		AddrBase = getOp1(AI);
+	} else {
+		AddrBase = AI;
 	}
+	regClearInst(RI, AddrBase);
+}
+
+static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
+				unsigned OpNum, unsigned Size, X64Reg* dest,
+				bool Profiled,
+				unsigned ProfileOffset = 0) {
+	if (isImm(*AI)) {
+		unsigned addr = RI.Build->GetImmValue(AI);
+		if (Memory::IsRAMAddress(addr)) {
+			if (dest)
+				*dest = regFindFreeReg(RI);
+			if (Profiled)
+				return M((void*)((u32)Memory::base + (addr & Memory::MEMVIEW32_MASK)));
+			return M((void*)addr);
+		}
+	}
+	unsigned offset;
+	InstLoc AddrBase;
+	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
+		offset = RI.Build->GetImmValue(getOp2(AI));
+		AddrBase = getOp1(AI);
+	} else {
+		offset = 0;
+		AddrBase = AI;
+	}
+	X64Reg baseReg;
+	if (RI.IInfo[I - RI.FirstI] & (2 << OpNum)) {
+		baseReg = regEnsureInReg(RI, AddrBase);
+		regClearInst(RI, AddrBase);
+		if (dest)
+			*dest = baseReg;
+	} else if (dest) {
+		X64Reg reg = regFindFreeReg(RI);
+		if (!regLocForInst(RI, AddrBase).IsSimpleReg()) {
+			RI.Jit->MOV(32, R(reg), regLocForInst(RI, AddrBase));
+			baseReg = reg;
+		} else {
+			baseReg = regLocForInst(RI, AddrBase).GetSimpleReg();
+		}
+		*dest = reg;
+	} else {
+		baseReg = regEnsureInReg(RI, AddrBase);
+	}
+
+	if (Profiled) {
+		return MDisp(baseReg, (u32)Memory::base + offset + ProfileOffset);
+	}
+	return MDisp(baseReg, offset);
+}
+// end FIXME
+
+static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
+	if (RI.UseProfile) {
+		unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++];
+		if (!(curLoad & 0x0C000000)) {
+			X64Reg reg;
+			OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1,
+							Size, &reg, true,
+							-(curLoad & 0xC0000000));
+			RI.Jit->MOVZX(32, Size, reg, addr);
+			RI.Jit->BSWAP(Size, reg);
+			if (regReadUse(RI, I))
+				RI.regs[reg] = I;
+			return;
+		}
+	}
+	X64Reg reg;
+	OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1, Size, &reg, false);
+	RI.Jit->LEA(32, ECX, addr);
 	if (RI.MakeProfile) {
-		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(reg));
+		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX));
 	}
-	RI.Jit->TEST(32, R(reg), Imm32(0x0C000000));
+	RI.Jit->TEST(32, R(ECX), Imm32(0x0C000000));
 	FixupBranch argh = RI.Jit->J_CC(CC_Z);
 	if (reg != EAX)
 		RI.Jit->PUSH(32, R(EAX));
 	switch (Size) {
-	case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
-	case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
-	case 8:  RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
+	case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), ECX); break;
+	case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), ECX); break;
+	case 8:  RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), ECX); break;
 	}
 	if (reg != EAX) {
 		RI.Jit->MOV(32, R(reg), R(EAX));
@@ -684,41 +782,87 @@ static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
 	}
 	FixupBranch arg2 = RI.Jit->J();
 	RI.Jit->SetJumpTarget(argh);
-	RI.Jit->UnsafeLoadRegToReg(reg, reg, Size, 0, false);
+	RI.Jit->UnsafeLoadRegToReg(ECX, reg, Size, 0, false);
 	RI.Jit->SetJumpTarget(arg2);
 	if (regReadUse(RI, I))
 		RI.regs[reg] = I;
 }
 
+static OpArg regSwappedImmForConst(RegInfo& RI, InstLoc I, unsigned Size) {
+	unsigned imm = RI.Build->GetImmValue(I);
+	if (Size == 32) {
+		imm = Common::swap32(imm);
+		return Imm32(imm);
+	} else if (Size == 16) {
+		imm = Common::swap16(imm);
+		return Imm16(imm);
+	} else {
+		return Imm8(imm);
+	}
+}
+
+static OpArg regImmForConst(RegInfo& RI, InstLoc I, unsigned Size) {
+	unsigned imm = RI.Build->GetImmValue(I);
+	if (Size == 32) {
+		return Imm32(imm);
+	} else if (Size == 16) {
+		return Imm16(imm);
+	} else {
+		return Imm8(imm);
+	}
+}
+
 static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
 	if (RI.UseProfile) {
 		unsigned curStore = ProfiledLoads[RI.numProfiledLoads++];
 		if (!(curStore & 0x0C000000)) {
-			X64Reg reg = regEnsureInReg(RI, getOp2(I));
-			RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
-			RI.Jit->BSWAP(Size, ECX);
-			unsigned addr = (u32)Memory::base - (curStore & 0xC0000000);
-			RI.Jit->MOV(Size, MDisp(reg, addr), R(ECX));
+			OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2,
+							Size, 0, true,
+							-(curStore & 0xC0000000));
+			if (isImm(*getOp1(I))) {
+				RI.Jit->MOV(Size, addr, regSwappedImmForConst(RI, getOp1(I), Size));
+			} else {
+				RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
+				RI.Jit->BSWAP(Size, ECX);
+				RI.Jit->MOV(Size, addr, R(ECX));
+			}
+			if (RI.IInfo[I - RI.FirstI] & 4)
+				regClearInst(RI, getOp1(I));
 			return;
 		} else if ((curStore & 0xFFFFF000) == 0xCC008000) {
 			regSpill(RI, EAX);
-			RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
-			RI.Jit->BSWAP(Size, ECX);
+			if (isImm(*getOp1(I))) {
+				RI.Jit->MOV(Size, R(ECX), regSwappedImmForConst(RI, getOp1(I), Size));
+			} else {
+				RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
+				RI.Jit->BSWAP(Size, ECX);
+			}
 			RI.Jit->MOV(32, R(EAX), M(&GPFifo::m_gatherPipeCount));
 			RI.Jit->MOV(Size, MDisp(EAX, (u32)GPFifo::m_gatherPipe), R(ECX));
 			RI.Jit->ADD(32, R(EAX), Imm8(Size >> 3));
 			RI.Jit->MOV(32, M(&GPFifo::m_gatherPipeCount), R(EAX));
 			RI.Jit->js.fifoBytesThisBlock += Size >> 3;
+			if (RI.IInfo[I - RI.FirstI] & 4)
+				regClearInst(RI, getOp1(I));
+			//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
+			regClearDeadMemAddress(RI, I, getOp2(I), 2);
 			return;
 		}
 	}
+	OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
+	RI.Jit->LEA(32, ECX, addr);
 	regSpill(RI, EAX);
-	RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
-	RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
+	if (isImm(*getOp1(I))) {
+		RI.Jit->MOV(Size, R(EAX), regImmForConst(RI, getOp1(I), Size));
+	} else {
+		RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
+	}
 	if (RI.MakeProfile) {
 		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX));
 	}
 	RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0);
+	if (RI.IInfo[I - RI.FirstI] & 4)
+		regClearInst(RI, getOp1(I));
 }
 
 static void regEmitShiftInst(RegInfo& RI, InstLoc I,
@@ -787,7 +931,6 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 	RI.Build = ibuild;
 	RI.UseProfile = UseProfile;
 	RI.MakeProfile = !RI.UseProfile;
-	unsigned bs = Jit->js.blockStart;
 	// Pass to compute liveness
 	ibuild->StartBackPass();
 	for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
@@ -825,21 +968,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 			if (thisUsed)
 				regMarkUse(RI, I, getOp1(I), 1);
 			break;
-		case StoreCR:
-		case StoreCarry:
 		case Load8:
 		case Load16:
 		case Load32:
-			if (getOpcode(*getOp1(I)) == Add &&
-			    isImm(*getOp2(getOp1(I)))) {
-				regMarkUse(RI, I, getOp1(getOp1(I)), 1);
-				break;
-			}
+			regMarkMemAddress(RI, I, getOp1(I), 1);
+			break;
+		case StoreCR:
+		case StoreCarry:
+			regMarkUse(RI, I, getOp1(I), 1);
+			break;
 		case StoreGReg:
 		case StoreLink:
 		case StoreCTR:
 		case StoreMSR:
-			regMarkUse(RI, I, getOp1(I), 1);
+			if (!isImm(*getOp1(I)))
+				regMarkUse(RI, I, getOp1(I), 1);
 			break;
 		case Add:
 		case Sub:
@@ -866,8 +1009,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		case Store8:
 		case Store16:
 		case Store32:
-			regMarkUse(RI, I, getOp1(I), 1);
-			regMarkUse(RI, I, getOp2(I), 2);
+			if (!isImm(*getOp1(I)))
+				regMarkUse(RI, I, getOp1(I), 1);
+			regMarkMemAddress(RI, I, getOp2(I), 2);
 			break;
 		case BranchUncond:
 			if (!isImm(*getOp1(I)))
@@ -1238,7 +1382,11 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		    getOpcode(*I) != BranchCond &&
 		    getOpcode(*I) != Load8 &&
 		    getOpcode(*I) != Load16 &&
-		    getOpcode(*I) != Load32) {
+		    getOpcode(*I) != Load32 &&
+		    getOpcode(*I) != Store8 &&
+		    getOpcode(*I) != Store16 &&
+		    getOpcode(*I) != Store32 &&
+		    1) {
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				regClearInst(RI, getOp1(I));
 			if (RI.IInfo[I - RI.FirstI] & 8)
@@ -1252,7 +1400,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		}
 	}
 
-	if (RI.numSpills)
+	if (UseProfile && RI.numSpills)
 		printf("Block: %x, numspills %d\n", Jit->js.blockStart, RI.numSpills);
 
 	Jit->UD2();
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
index a7a1a47563..74bfaadd0f 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@@ -80,6 +80,69 @@ namespace IREmitter {
 		Store16,
 		Store32,
 		BranchCond,
+		// Floating-point
+		// There are three floating-point formats: single, double,
+		// and packed. For any operation where the format of the
+		// operand isn't known, the ForceTo* operations are used;
+		// these are folded into the appropriate conversion
+		// (or no conversion) depending on the type of the operand.
+		// The "mreg" format is a pair of doubles; this is the
+		// most general possible representation which is used
+		// in the register state.
+		// This might seem like overkill, but it's a huge advantage
+		// to keep operands in the right format because extra
+		// precision can screw up games.
+		// FIXME: Does the slight loss of precision due to not
+		// having a madd instruction matter? It would be a
+		// performance loss for singles because the operations
+		// would have to be done in double precision, and a completely
+		// accurate double madd would require an extremely expensive
+		// fallback.
+		FDAdd,
+		FDSub,
+		FDMul,
+		FDDiv,
+		FDNeg,
+		FSAdd,
+		FSSub,
+		FSMul,
+		FSDiv,
+		FSNeg,
+		FPSAdd,
+		FPSSub,
+		FPSMul,
+		FPSDiv,
+		FPSNeg,
+		// FP Loads
+		LoadSingle,
+		LoadDouble,
+		// LoadPacked, // FIXME: Work out how this instruction should
+		//             be implemented
+		// FP Stores
+		StoreSingle,
+		StoreDouble,
+		// StorePacked, // FIXME: Work out how this instruction should
+		//              be implemented
+		PackedToSingle, // Extract PS0 from packed (type-pun)
+		// PackedToDouble == PackedToSingle+SingleToDouble
+		PackedToMReg, // Convert from packed format to mreg format (CVTPS2PD)
+		SingleToDouble, // Widen single to double (CVTSS2SD)
+		SingleToPacked, // Duplicate single to packed
+		// SingleToMReg == SingleToPacked+PackedToMReg
+		MRegToPacked, // Convert from mreg format to packed format (CVTPD2PS)
+		MRegToDouble, // Extract bottom half from mreg format. (type-pun)
+		// MRegToSingle == MRegToDouble + DoubleToSingle
+		DoubleToMReg, // Convert from double format to mreg format
+		DoubleToSingle, // Convert from double to single format (CVTSD2SS)
+		// DoubleToPacked should never be needed
+
+		ForceToPacked, // ForceTo* are "virtual"; they should be
+		// folded into the above conversions.
+		ForceToSingle,
+		ForceToDouble,
+		ForceToMReg,
+		LoadFPReg,
+		StoreFPReg,
 
 		// "Trinary" operators
 		// FIXME: Need to change representation!
@@ -330,6 +393,7 @@ namespace IREmitter {
 		IRBuilder() { Reset(); }
 
 	private:
+		IRBuilder(IRBuilder&); // DO NOT IMPLEMENT
 		std::vector InstList; // FIXME: We must ensure this is
 				      // continuous!
 		std::vector ConstList;
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
index 09596548df..61868d80a6 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
@@ -81,14 +81,10 @@ using namespace Gen;
 		CRTest = ibuild.EmitXor(CRTest, CRCmp);
 	}
 
-	if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) {
+	if ((inst.BO & 4) == 0) {
 		IREmitter::InstLoc c = ibuild.EmitLoadCTR();
 		c = ibuild.EmitSub(c, ibuild.EmitIntConst(1));
 		ibuild.EmitStoreCTR(c);
-	}
-
-	if ((inst.BO & 4) == 0) {
-		IREmitter::InstLoc c = ibuild.EmitLoadCTR();
 		if (!(inst.BO & 2)) {
 			CTRTest = ibuild.EmitICmpEq(c, ibuild.EmitIntConst(0));