And a bit more JIT WIP work: improved code generation for integer

load/store, and outlining the start of FP support. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1729 8ced0084-cf51-0410-be5f-012b33b47a6e
2009-01-01 12:50:23 +00:00 · 2009-01-01 12:50:23 +00:00 · 0367e7ee4d
parent bd3f468c37
commit 0367e7ee4d
3 changed files with 272 additions and 64 deletions
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@ -78,14 +78,30 @@ on the test I've been working on (which bounded by JIT performance and doesn't
 use any floating-point), it's roughly 25% faster than the current JIT, with the
 edge over the current JIT mostly due to the fast memory optimization.
 Update on perf:
 I've been doing a bit more tweaking for a small perf improvement (in the 
 range of 5-10%).  That said, it's getting to the point where I'm simply
 not seeing potential for improvements to codegen, at least for long,
 straightforward blocks.  For one long block that's at the top of my samples,
 I've managed to get the bloat% (number of instructions compared to PPC
 equivalent) down to 225%, and I can't really see it going down much further.
 It looks like the most promising paths to further improvement for pure
 integer code are more aggresively combining blocks and dead condition
 register elimination, which should be very helpful for small blocks.
 TODO (in no particular order):
-Floating-point JIT (both paired and unpaired): currently falls back
+Floating-point JIT (both paired and unpaired)
-	to the interpreter
+	(very large win for FP code, no effect for integer code)
 Inter-block dead condition register elimination (Likely significant win
 	combined with optimized conditions)
 Optimize conditions for conditional branches.
-Inter-block dead register elimination, especially for CR0.
+General dead register elimination.
 Inter-block inlining.
-Track down a few correctness bugs.
+Track down a few correctness bugs (I think there's something wrong
-Implement a select instruction
+	with my branches, but I haven't been able to figure it out).
 Specialized slw/srw/sraw; I think there are some tricks that could
 	have a non-trivial effect, and there are significantly shorter
 	implementations for 64-bit involving abusing 64-bit shifts.
 64-bit compat (it should only be a few tweaks to register allocation and
 	the load/store code)
 Scheduling to reduce register pressure: PowerPC	compilers like to push
@ -93,8 +109,16 @@ Scheduling to reduce register pressure: PowerPC	compilers like to push
 	x86 processors, which are short on registers and extremely good at
 	instruction reordering.
 Common subexpression elimination
-Optimize load of sum using complex addressing (partially implemented)
+Optimize load/store of sum using complex addressing (partially implemented)
 Implement idle-skipping
 Loop optimizations (loop-carried registers, LICM); not sure how much
 	this will help on top of dead register elimination
 Fold loads (both register and memory) into arithmetic operations
 Code refactoring/cleanup
 Investigate performance of the JIT itself; this doesn't affect
 	framerates significantly, but it does take a visible amount
 	of time for a complicated piece of code like a video decoder
 	to compile.
 */
@ -492,6 +516,9 @@ struct RegInfo {
 		exitNumber = 0;
 		MakeProfile = UseProfile = false;
 	}
 	private:
 	RegInfo(RegInfo&); // DO NOT IMPLEMENT
 };
 static void regMarkUse(RegInfo& R, InstLoc I, InstLoc Op, unsigned OpNum) {
@ -635,48 +662,119 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
 	RI.regs[reg] = I;
 }
-static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
+// Mark and calculation routines for profiled load/store addresses
-	X64Reg reg;
+// Could be extended to unprofiled addresses.
-	unsigned offset;
+// FIXME: Finish/activate!
-				
+static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
-	if (getOpcode(*getOp1(I)) == Add && isImm(*getOp2(getOp1(I)))) {
+	if (isImm(*AI)) {
-		offset = RI.Build->GetImmValue(getOp2(getOp1(I)));
+		unsigned addr = RI.Build->GetImmValue(AI);	
-		reg = regBinLHSReg(RI, getOp1(I));
+		if (Memory::IsRAMAddress(addr))
-		if (RI.IInfo[I - RI.FirstI] & 4)
+			return;
 			regClearInst(RI, getOp1(getOp1(I)));
 	} else {
 		offset = 0;
 		reg = regBinLHSReg(RI, I);
 		if (RI.IInfo[I - RI.FirstI] & 4)
 			regClearInst(RI, getOp1(I));
 	}
-	if (RI.UseProfile) {
+	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
-		unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++];
+		regMarkUse(RI, I, getOp1(AI), OpNum);
-		if (!(curLoad & 0x0C000000)) {
+		return;
-			if (regReadUse(RI, I)) {
+	}
-				unsigned addr = (u32)Memory::base - (curLoad & 0xC0000000) + offset;
+	regMarkUse(RI, I, AI, OpNum);
-				RI.Jit->MOVZX(32, Size, reg, MDisp(reg, addr));
+}
-				RI.Jit->BSWAP(Size, reg);
+
-				RI.regs[reg] = I;
+static void regClearDeadMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
-			}
+	if (!(RI.IInfo[I - RI.FirstI] & (2 << OpNum)))
 		return;
 	if (isImm(*AI)) {
 		unsigned addr = RI.Build->GetImmValue(AI);	
 		if (Memory::IsRAMAddress(addr)) {
 			return;
 		}
 	}
-	if (offset) {
+	InstLoc AddrBase;
-		RI.Jit->ADD(32, R(reg), Imm32(offset));
+	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
 		AddrBase = getOp1(AI);
 	} else {
 		AddrBase = AI;
 	}
 	regClearInst(RI, AddrBase);
 }
 static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
 				unsigned OpNum,	unsigned Size, X64Reg* dest,
 				bool Profiled,
 				unsigned ProfileOffset = 0) {
 	if (isImm(*AI)) {
 		unsigned addr = RI.Build->GetImmValue(AI);	
 		if (Memory::IsRAMAddress(addr)) {
 			if (dest)
 				*dest = regFindFreeReg(RI);
 			if (Profiled) 
 				return M((void*)((u32)Memory::base + (addr & Memory::MEMVIEW32_MASK)));
 			return M((void*)addr);
 		}
 	}
 	unsigned offset;
 	InstLoc AddrBase;
 	if (getOpcode(*AI) == Add && isImm(*getOp2(AI))) {
 		offset = RI.Build->GetImmValue(getOp2(AI));
 		AddrBase = getOp1(AI);
 	} else {
 		offset = 0;
 		AddrBase = AI;
 	}
 	X64Reg baseReg;
 	if (RI.IInfo[I - RI.FirstI] & (2 << OpNum)) {
 		baseReg = regEnsureInReg(RI, AddrBase);
 		regClearInst(RI, AddrBase);
 		if (dest)
 			*dest = baseReg;
 	} else if (dest) {
 		X64Reg reg = regFindFreeReg(RI);
 		if (!regLocForInst(RI, AddrBase).IsSimpleReg()) {
 			RI.Jit->MOV(32, R(reg), regLocForInst(RI, AddrBase));
 			baseReg = reg;
 		} else {
 			baseReg = regLocForInst(RI, AddrBase).GetSimpleReg();
 		}
 		*dest = reg;
 	} else {
 		baseReg = regEnsureInReg(RI, AddrBase);
 	}
 	if (Profiled) {
 		return MDisp(baseReg, (u32)Memory::base + offset + ProfileOffset);
 	}
 	return MDisp(baseReg, offset);
 }
 // end FIXME
 static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
 	if (RI.UseProfile) {
 		unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++];
 		if (!(curLoad & 0x0C000000)) {
 			X64Reg reg;
 			OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1,
 							Size, &reg, true,
 							-(curLoad & 0xC0000000));
 			RI.Jit->MOVZX(32, Size, reg, addr);
 			RI.Jit->BSWAP(Size, reg);
 			if (regReadUse(RI, I))
 				RI.regs[reg] = I;
 			return;
 		}
 	}
 	X64Reg reg;
 	OpArg addr = regBuildMemAddress(RI, I, getOp1(I), 1, Size, &reg, false);
 	RI.Jit->LEA(32, ECX, addr);
 	if (RI.MakeProfile) {
-		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(reg));
+		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX));
 	}
-	RI.Jit->TEST(32, R(reg), Imm32(0x0C000000));
+	RI.Jit->TEST(32, R(ECX), Imm32(0x0C000000));
 	FixupBranch argh = RI.Jit->J_CC(CC_Z);
 	if (reg != EAX)
 		RI.Jit->PUSH(32, R(EAX));
 	switch (Size)
 	{
-	case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
+	case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), ECX); break;
-	case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
+	case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), ECX); break;
-	case 8:  RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg);  break;
+	case 8:  RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), ECX);  break;
 	}
 	if (reg != EAX) {
 		RI.Jit->MOV(32, R(reg), R(EAX));
@ -684,41 +782,87 @@ static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
 	}
 	FixupBranch arg2 = RI.Jit->J();
 	RI.Jit->SetJumpTarget(argh);
-	RI.Jit->UnsafeLoadRegToReg(reg, reg, Size, 0, false);
+	RI.Jit->UnsafeLoadRegToReg(ECX, reg, Size, 0, false);
 	RI.Jit->SetJumpTarget(arg2);
 	if (regReadUse(RI, I))
 		RI.regs[reg] = I;
 }
 static OpArg regSwappedImmForConst(RegInfo& RI, InstLoc I, unsigned Size) {
 	unsigned imm = RI.Build->GetImmValue(I);
 	if (Size == 32) {
 		imm = Common::swap32(imm);
 		return Imm32(imm);
 	} else if (Size == 16) {
 		imm = Common::swap16(imm);
 		return Imm16(imm);
 	} else {
 		return Imm8(imm);
 	}
 }
 static OpArg regImmForConst(RegInfo& RI, InstLoc I, unsigned Size) {
 	unsigned imm = RI.Build->GetImmValue(I);
 	if (Size == 32) {
 		return Imm32(imm);
 	} else if (Size == 16) {
 		return Imm16(imm);
 	} else {
 		return Imm8(imm);
 	}
 }
 static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
 	if (RI.UseProfile) {
 		unsigned curStore = ProfiledLoads[RI.numProfiledLoads++];
 		if (!(curStore & 0x0C000000)) {
-			X64Reg reg = regEnsureInReg(RI, getOp2(I));
+			OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2,
-			RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
+							Size, 0, true,
-			RI.Jit->BSWAP(Size, ECX);
+							-(curStore & 0xC0000000));
-			unsigned addr = (u32)Memory::base - (curStore & 0xC0000000);
+			if (isImm(*getOp1(I))) {
-			RI.Jit->MOV(Size, MDisp(reg, addr), R(ECX));
+				RI.Jit->MOV(Size, addr, regSwappedImmForConst(RI, getOp1(I), Size));
 			} else {
 				RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
 				RI.Jit->BSWAP(Size, ECX);
 				RI.Jit->MOV(Size, addr, R(ECX));
 			}
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				regClearInst(RI, getOp1(I));
 			return;
 		} else if ((curStore & 0xFFFFF000) == 0xCC008000) {
 			regSpill(RI, EAX);
-			RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
+			if (isImm(*getOp1(I))) {
-			RI.Jit->BSWAP(Size, ECX);
+				RI.Jit->MOV(Size, R(ECX), regSwappedImmForConst(RI, getOp1(I), Size));
 			} else { 
 				RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
 				RI.Jit->BSWAP(Size, ECX);
 			}
 			RI.Jit->MOV(32, R(EAX), M(&GPFifo::m_gatherPipeCount));
 			RI.Jit->MOV(Size, MDisp(EAX, (u32)GPFifo::m_gatherPipe), R(ECX));
 			RI.Jit->ADD(32, R(EAX), Imm8(Size >> 3));
 			RI.Jit->MOV(32, M(&GPFifo::m_gatherPipeCount), R(EAX));
 			RI.Jit->js.fifoBytesThisBlock += Size >> 3;
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				regClearInst(RI, getOp1(I));
 			//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
 			regClearDeadMemAddress(RI, I, getOp2(I), 2);
 			return;
 		}
 	}
 	OpArg addr = regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
 	RI.Jit->LEA(32, ECX, addr);
 	regSpill(RI, EAX);
-	RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
+	if (isImm(*getOp1(I))) {
-	RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
+		RI.Jit->MOV(Size, R(EAX), regImmForConst(RI, getOp1(I), Size));
 	} else {
 		RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
 	}
 	if (RI.MakeProfile) {
 		RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX));
 	}
 	RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0);
 	if (RI.IInfo[I - RI.FirstI] & 4)
 		regClearInst(RI, getOp1(I));
 }
 static void regEmitShiftInst(RegInfo& RI, InstLoc I,
@ -787,7 +931,6 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 	RI.Build = ibuild;
 	RI.UseProfile = UseProfile;
 	RI.MakeProfile = !RI.UseProfile;
 	unsigned bs = Jit->js.blockStart;
 	// Pass to compute liveness
 	ibuild->StartBackPass();
 	for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
@ -825,21 +968,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 			if (thisUsed)
 				regMarkUse(RI, I, getOp1(I), 1);
 			break;
 		case StoreCR:
 		case StoreCarry:
 		case Load8:
 		case Load16:
 		case Load32:
-			if (getOpcode(*getOp1(I)) == Add &&
+			regMarkMemAddress(RI, I, getOp1(I), 1);
-			    isImm(*getOp2(getOp1(I)))) {
+			break;
-				regMarkUse(RI, I, getOp1(getOp1(I)), 1);
+		case StoreCR:
-				break;
+		case StoreCarry:
-			}
+			regMarkUse(RI, I, getOp1(I), 1);
 			break;
 		case StoreGReg:
 		case StoreLink:
 		case StoreCTR:
 		case StoreMSR:
-			regMarkUse(RI, I, getOp1(I), 1);
+			if (!isImm(*getOp1(I)))
 				regMarkUse(RI, I, getOp1(I), 1);
 			break;
 		case Add:
 		case Sub:
@ -866,8 +1009,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		case Store8:
 		case Store16:
 		case Store32:
-			regMarkUse(RI, I, getOp1(I), 1);
+			if (!isImm(*getOp1(I)))
-			regMarkUse(RI, I, getOp2(I), 2);
+				regMarkUse(RI, I, getOp1(I), 1);
 			regMarkMemAddress(RI, I, getOp2(I), 2);
 			break;
 		case BranchUncond:
 			if (!isImm(*getOp1(I)))
@ -1238,7 +1382,11 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		    getOpcode(*I) != BranchCond &&
 		    getOpcode(*I) != Load8 && 
 		    getOpcode(*I) != Load16 && 
-		    getOpcode(*I) != Load32) {
+		    getOpcode(*I) != Load32 &&
 		    getOpcode(*I) != Store8 &&
 		    getOpcode(*I) != Store16 &&
 		    getOpcode(*I) != Store32 &&
 		    1) {
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				regClearInst(RI, getOp1(I));
 			if (RI.IInfo[I - RI.FirstI] & 8)
@ -1252,7 +1400,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		}
 	}
-	if (RI.numSpills)
+	if (UseProfile && RI.numSpills)
 		printf("Block: %x, numspills %d\n", Jit->js.blockStart, RI.numSpills);
 	Jit->UD2();
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@ -80,6 +80,69 @@ namespace IREmitter {
 		Store16,
 		Store32,
 		BranchCond,
 		// Floating-point
 		// There are three floating-point formats: single, double,
 		// and packed.  For any operation where the format of the
 		// operand isn't known, the ForceTo* operations are used;
 		// these are folded into the appropriate conversion
 		// (or no conversion) depending on the type of the operand.
 		// The "mreg" format is a pair of doubles; this is the
 		// most general possible represenation which is used
 		// in the register state.
 		// This might seem like overkill, but it's a huge advantage
 		// to keep operands in the right format because extra
 		// precision can screw up games.
 		// FIXME: Does the slight loss of precision due to not
 		// having a madd instruction matter?  It would be a
 		// performance loss for singles because the operations
 		// would have to be done in double precision, and a completely 
 		// accurate double madd would require an extremely expensive
 		// fallback.
 		FDAdd,
 		FDSub,
 		FDMul,
 		FDDiv,
 		FDNeg,
 		FSAdd,
 		FSSub,
 		FSMul,
 		FSDiv,
 		FSNeg,
 		FPSAdd,
 		FPSSub,
 		FPSMul,
 		FPSDiv,
 		FPSNeg,
 		// FP Loads
 		LoadSingle,
 		LoadDouble,
 		// LoadPacked, // FIXME: Work out how this instruction should
 				// be implemented
 		// FP Stores
 		StoreSingle,
 		StoreDouble,
 		// StorePacked, // FIXME: Work out how this instruction should
 				// be implemented
 		PackedToSingle, // Extract PS0 from packed (type-pun)
 		// PackedToDouble == PackedToSingle+SingleToDouble
 		PackedToMReg,   // Convert from packed format to mreg format (CVTPS2PD)
 		SingleToDouble, // Widen single to double (CVTSS2SD)
 		SingleToPacked, // Duplicate single to packed
 		// SingleToMReg == SingleToPacked+PackedToMReg
 		MRegToPacked,   // Convert from mreg format to packed format (CVTPD2PS)
 		MRegToDouble,   // Extract bottom half from mreg format. (type-pun)
 		// MRegToSingle == MRegToDouble + DoubleToSingle
 		DoubleToMReg,   // Convert from double format to mreg format
 		DoubleToSingle, // Convert from double to single format (CVTSD2SS)
 		// DoubleToPacked should never be needed
 		ForceToPacked,  // ForceTo* are "virtual"; they should be
 				// folded into the above conversions.
 		ForceToSingle,
 		ForceToDouble,
 		ForceToMReg,
 		LoadFPReg,
 		StoreFPReg,
 		// "Trinary" operators
 		// FIXME: Need to change representation!
@ -330,6 +393,7 @@ namespace IREmitter {
 		IRBuilder() { Reset(); }
 		private:
 		IRBuilder(IRBuilder&); // DO NOT IMPLEMENT
 		std::vector<Inst> InstList; // FIXME: We must ensure this is 
 					    // continuous!
 		std::vector<unsigned> ConstList;
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
@ -81,14 +81,10 @@ using namespace Gen;
 				CRTest = ibuild.EmitXor(CRTest, CRCmp);
 		}
-		if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) {
+		if ((inst.BO & 4) == 0) {
 			IREmitter::InstLoc c = ibuild.EmitLoadCTR();
 			c = ibuild.EmitSub(c, ibuild.EmitIntConst(1));
 			ibuild.EmitStoreCTR(c);
 		}
 		if ((inst.BO & 4) == 0) {
 			IREmitter::InstLoc c = ibuild.EmitLoadCTR();
 			if (!(inst.BO & 2)) {
 				CTRTest = ibuild.EmitICmpEq(c,
 						ibuild.EmitIntConst(0));