From 018cb993e30027b0261f96037f56fac107492743 Mon Sep 17 00:00:00 2001 From: magumagu9 Date: Sun, 11 Jan 2009 01:26:58 +0000 Subject: [PATCH] A tiny bit more JIT WIP work. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1847 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp | 79 +++++++++++++-------- Source/Core/Core/Src/PowerPC/Jit64IL/IR.h | 13 ++++ 2 files changed, 62 insertions(+), 30 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp index 2ae2c5cbae..007079338e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -90,16 +90,15 @@ integer code are more aggresively combining blocks and dead condition register elimination, which should be very helpful for small blocks. TODO (in no particular order): -Floating-point JIT (both paired and unpaired) - (very large win for FP code, no effect for integer code) -Inter-block dead condition register elimination (Likely significant win - combined with optimized conditions) -Optimize conditions for conditional branches. -General dead register elimination. -Inter-block inlining. -Track down issues with new JIT + dual-core mode (I think I'm going to - need help with this one; I'm not very familiar with the - dual-core code.) +JIT for misc remaining FP instructions +JIT for bcctrx +Misc optimizations for FP instructions +Inter-block dead register elimination; this seems likely to have large + performance benefits, although I'm not completely sure. +Inter-block inlining; also likely to have large performance benefits. + The tricky parts are deciding which blocks to inline, and that the + IR can't really deal with branches whose destination is in the + the middle of a generated block. 
Specialized slw/srw/sraw; I think there are some tricks that could have a non-trivial effect, and there are significantly shorter implementations for 64-bit involving abusing 64-bit shifts. @@ -111,15 +110,19 @@ Scheduling to reduce register pressure: PowerPC compilers like to push instruction reordering. Common subexpression elimination Optimize load/store of sum using complex addressing (partially implemented) -Implement idle-skipping -Loop optimizations (loop-carried registers, LICM); not sure how much - this will help on top of dead register elimination -Fold loads (both register and memory) into arithmetic operations +Loop optimizations (loop-carried registers, LICM) +Fold register loads into arithmetic operations Code refactoring/cleanup Investigate performance of the JIT itself; this doesn't affect framerates significantly, but it does take a visible amount of time for a complicated piece of code like a video decoder to compile. +Fix profiled loads/stores to work safely. On 32-bit, one solution is to + use a spare segment register, and expand the backpatch solution + to work in all the relevant situations. On 64-bit, the existing + fast memory solution should basically work. An alternative + would be to figure out a heuristic for what loads actually + vary their "type", and special-case them. 
*/ @@ -464,6 +467,12 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) { if (branchValue == 2) return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)), getOp2(getOp1(Op1))), Op2); + if (branchValue == 4) + return FoldBranchCond(EmitICmpSgt(getOp1(getOp1(Op1)), + getOp2(getOp1(Op1))), Op2); + if (branchValue == 8) + return FoldBranchCond(EmitICmpSlt(getOp1(getOp1(Op1)), + getOp2(getOp1(Op1))), Op2); } if (getOpcode(*Op1) == Xor && isImm(*getOp2(Op1))) { @@ -475,10 +484,15 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) { unsigned innerBranchValue = GetImmValue(getOp2(XOp1)); if (branchValue == innerBranchValue) { - if (branchValue == 4) { + if (branchValue == 2) + return FoldBranchCond(EmitICmpNe(getOp1(getOp1(XOp1)), + getOp2(getOp1(XOp1))), Op2); + if (branchValue == 4) return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)), getOp2(getOp1(XOp1))), Op2); - } + if (branchValue == 8) + return FoldBranchCond(EmitICmpSge(getOp1(getOp1(XOp1)), + getOp2(getOp1(XOp1))), Op2); } } } @@ -493,6 +507,9 @@ InstLoc IRBuilder::FoldICmp(unsigned Opcode, InstLoc Op1, InstLoc Op2) { case ICmpEq: result = GetImmValue(Op1) == GetImmValue(Op2); break; + case ICmpNe: + result = GetImmValue(Op1) != GetImmValue(Op2); + break; case ICmpUgt: result = GetImmValue(Op1) > GetImmValue(Op2); break; @@ -1285,9 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { regMarkUse(RI, I, getOp1(I), 1); break; case BranchCond: { - unsigned CondOpcode = getOpcode(*getOp1(I)); - if ((CondOpcode == ICmpEq || - CondOpcode == ICmpSle) && + if (isICmp(*getOp1(I)) && isImm(*getOp2(getOp1(I)))) { regMarkUse(RI, I, getOp1(getOp1(I)), 1); } else { @@ -1904,20 +1919,24 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case BlockEnd: break; case BranchCond: { - if (getOpcode(*getOp1(I)) == ICmpEq && + if (isICmp(*getOp1(I)) && isImm(*getOp2(getOp1(I)))) { Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), 
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); - FixupBranch cont = Jit->J_CC(CC_NZ); - regWriteExit(RI, getOp2(I)); - Jit->SetJumpTarget(cont); - if (RI.IInfo[I - RI.FirstI] & 4) - regClearInst(RI, getOp1(getOp1(I))); - } else if (getOpcode(*getOp1(I)) == ICmpSle && - isImm(*getOp2(getOp1(I)))) { - Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), - Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); - FixupBranch cont = Jit->J_CC(CC_G); + CCFlags flag; + switch (getOpcode(*getOp1(I))) { + case ICmpEq: flag = CC_NE; break; + case ICmpNe: flag = CC_E; break; + case ICmpUgt: flag = CC_BE; break; + case ICmpUlt: flag = CC_AE; break; + case ICmpUge: flag = CC_B; break; + case ICmpUle: flag = CC_A; break; + case ICmpSgt: flag = CC_LE; break; + case ICmpSlt: flag = CC_GE; break; + case ICmpSge: flag = CC_L; break; + case ICmpSle: flag = CC_G; break; + } + FixupBranch cont = Jit->J_CC(flag); regWriteExit(RI, getOp2(I)); Jit->SetJumpTarget(cont); if (RI.IInfo[I - RI.FirstI] & 4) regClearInst(RI, getOp1(getOp1(I))); diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index 58f768dcd1..27edb78c5c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -216,6 +216,10 @@ namespace IREmitter { unsigned inline isImm(Inst i) { return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32; } + + unsigned inline isICmp(Inst i) { + return getOpcode(i) >= ICmpEq && getOpcode(i) <= ICmpSle; + } unsigned inline isFResult(Inst i) { return getOpcode(i) > FResult_Start && @@ -329,12 +333,21 @@ namespace IREmitter { InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpEq, op1, op2); } + InstLoc EmitICmpNe(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpNe, op1, op2); + } InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpUgt, op1, op2); } InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpSgt, op1, op2); } + InstLoc EmitICmpSlt(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpSlt, 
op1, op2); + } + InstLoc EmitICmpSge(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpSge, op1, op2); + } InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpSle, op1, op2); }