JitIL: Added a new IR instruction, MulHighUnsigned, which computes the upper 32 bits of the product of two unsigned 32-bit integers. Rewrote mulhwux with MulHighUnsigned.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6145 8ced0084-cf51-0410-be5f-012b33b47a6e
2010-08-29 08:00:51 +00:00 · 2010-08-29 08:00:51 +00:00 · a3df65bd02
parent e1d1a1eba0
commit a3df65bd02
4 changed files with 57 additions and 33 deletions
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@ -617,6 +617,36 @@ InstLoc IRBuilder::FoldMul(InstLoc Op1, InstLoc Op2) {
 	return EmitBiOp(Mul, Op1, Op2);
 }

+InstLoc IRBuilder::FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2) {
+	// Folds MulHighUnsigned: the upper 32 bits of the unsigned 64-bit product, (Op1 * Op2) >> 32.
+	if (isImm(*Op1) && isImm(*Op2)) {
+		return EmitIntConst((u32)(((u64)GetImmValue(Op1) * (u64)GetImmValue(Op2)) >> 32));
+	}
+	// Canonicalize: a lone immediate operand is moved into Op2.
+	if (isImm(*Op1) && !isImm(*Op2)) {
+		return FoldMulHighUnsigned(Op2, Op1);
+	}
+
+	if (isImm(*Op2)) {
+		const unsigned imm = GetImmValue(Op2);
+		// (x * 0) >> 32 => 0 and (x * 1) >> 32 => 0. imm == 1 must not become Shrl by 32:
+		// shifts ignore bits above the bottom 5, so that would return x unchanged.
+		if (imm <= 1) {
+			return EmitIntConst(0);
+		}
+
+		for (unsigned i0 = 1; i0 < 32; ++i0) {
+			// ((x * (1 << i0)) >> 32) => x >> (32 - i0), with a shift count in 1..31.
+			// One "shr" is faster than one "mul".
+			if (imm == (1U << i0)) {
+				return FoldShrl(Op1, EmitIntConst(32 - i0));
+			}
+		}
+	}
+
+	return EmitBiOp(MulHighUnsigned, Op1, Op2);
+}
+
 InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) {
 	simplifyCommutative(And, Op1, Op2);

@ -1001,6 +1031,7 @@ InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned
 		case Add: return FoldAdd(Op1, Op2);
 		case Sub: return FoldSub(Op1, Op2);
 		case Mul: return FoldMul(Op1, Op2);
+		case MulHighUnsigned: return FoldMulHighUnsigned(Op1, Op2);
 		case And: return FoldAnd(Op1, Op2);
 		case Or: return FoldOr(Op1, Op2);
 		case Xor: return FoldXor(Op1, Op2);
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@ -68,6 +68,7 @@ enum Opcode {
 	Or,
 	Xor,
 	// Non-commutative integer operators
+	MulHighUnsigned,
 	Sub,
 	Shl,  // Note that shifts ignore bits above the bottom 5
 	Shrl,
@ -224,6 +225,7 @@ private:
 	InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldSub(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldMul(InstLoc Op1, InstLoc Op2);
+	InstLoc FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
@ -306,6 +308,9 @@ public:
 	InstLoc EmitMul(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(Mul, op1, op2);
 	}
+	InstLoc EmitMulHighUnsigned(InstLoc op1, InstLoc op2) { // upper 32 bits of the unsigned product op1 * op2
+		return FoldBiOp(MulHighUnsigned, op1, op2); // routed through FoldBiOp, so constant operands may be folded
+	}
 	InstLoc EmitRol(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(Rol, op1, op2);
 	}
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
@ -803,6 +803,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 		case Or:
 		case Xor:
 		case Mul:
+		case MulHighUnsigned:
 		case Rol:
 		case Shl:
 		case Shrl:
@ -1105,6 +1106,23 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 			regNormalRegClear(RI, I);
 			break;
 		}
+		case MulHighUnsigned: { // emits code for the upper 32 bits of the unsigned product op1 * op2
+			if (!thisUsed) break; // result is never used: emit nothing
+			regSpill(RI, EAX); // EAX is overwritten with op2 below
+			regSpill(RI, EDX); // EDX is read back as the result after MUL
+			X64Reg reg = regBinReg(RI, I); // destination register for this instruction's result
+			if (isImm(*getOp2(I))) {
+				unsigned RHS = RI.Build->GetImmValue(getOp2(I));
+				Jit->MOV(32, R(EAX), Imm32(RHS));
+			} else {
+				Jit->MOV(32, R(EAX), regLocForInst(RI, getOp2(I)));
+			}
+			Jit->MUL(32, regLocForInst(RI, getOp1(I))); // presumably x86 MUL r/m32: EDX:EAX = EAX * op1 -- confirm against the emitter
+			Jit->MOV(32, R(reg), R(EDX)); // keep only the high half of the product
+			RI.regs[reg] = I;
+			regNormalRegClear(RI, I);
+			break;
+		}
 		case Rol: {
 			if (!thisUsed) break;
 			regEmitShiftInst(RI, I, &JitIL::ROL);
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
@ -299,42 +299,12 @@ void JitIL::mulhwux(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(Integer)

-	// Compute upper 32-bit of (a * b) using Karatsuba algorithm
-	// Karatsuba algorithm reduces the number of multiplication 4 to 3
-	//   d = a * b
-	//     = {a1 * (1 << 16) + a0} * {b1 * (1 << 16) + b0};
-	//     = d2 * (1 << 32) + d1 * (1 << 16) + d0
-	// where
-	//   d2 = a1 * b1
-	//   d0 = a0 * b0
-	//   d1 = (a1 + a0) * (b1 * b0) - d2 - d0
-	// since
-	//   d1 = (a1 + a0) * (b1 * b0) - d2 - d0
-	//      = a1 * b1 + a1 * b0 + a0 * b1 + a0 * b0 - a1 * b1 - a0 * b0
-	//      = a1 * b0 + a0 * b1
-	// The result of mulhwux is
-	//   d2' = (((d0 >> 16) + d1) >> 16) + d2
-	//
-	// Though it is not so fast...
 	IREmitter::InstLoc a = ibuild.EmitLoadGReg(inst.RA);
-	IREmitter::InstLoc a0 = ibuild.EmitAnd(a, ibuild.EmitIntConst(0xFFFF));
-	IREmitter::InstLoc a1 = ibuild.EmitShrl(a, ibuild.EmitIntConst(16));
 	IREmitter::InstLoc b = ibuild.EmitLoadGReg(inst.RB);
-	IREmitter::InstLoc b0 = ibuild.EmitAnd(b, ibuild.EmitIntConst(0xFFFF));
-	IREmitter::InstLoc b1 = ibuild.EmitShrl(b, ibuild.EmitIntConst(16));
-
-	IREmitter::InstLoc d2 = ibuild.EmitMul(a1, b1);
-	IREmitter::InstLoc d0 = ibuild.EmitMul(a0, b0);
-	IREmitter::InstLoc d1 = ibuild.EmitMul(ibuild.EmitAdd(a1, a0), ibuild.EmitAdd(b1, b0));
-	d1 = ibuild.EmitSub(d1, d2);
-	d1 = ibuild.EmitSub(d1, d0);
-
-	d1 = ibuild.EmitAdd(d1, ibuild.EmitShrl(d0, ibuild.EmitIntConst(16)));
-	d2 = ibuild.EmitAdd(d2, ibuild.EmitShrl(d1, ibuild.EmitIntConst(16)));
-
-	ibuild.EmitStoreGReg(d2, inst.RD);
+	IREmitter::InstLoc d = ibuild.EmitMulHighUnsigned(a, b);
+	ibuild.EmitStoreGReg(d, inst.RD);
 	if (inst.Rc)
-		ComputeRC(ibuild, d2);
+		ComputeRC(ibuild, d);
 }

 // skipped some of the special handling in here - if we get crashes, let the interpreter handle this op