JitIL: Added a new IR instruction, MulHighUnsigned, which computes the upper 32 bits of the product of two unsigned 32-bit integers. Rewrote mulhwux with MulHighUnsigned.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6145 8ced0084-cf51-0410-be5f-012b33b47a6e
2010-08-29 08:00:51 +00:00 · 2010-08-29 08:00:51 +00:00 · a3df65bd02
parent e1d1a1eba0
commit a3df65bd02
4 changed files with 57 additions and 33 deletions
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@ -617,6 +617,36 @@ InstLoc IRBuilder::FoldMul(InstLoc Op1, InstLoc Op2) {
 	return EmitBiOp(Mul, Op1, Op2);
 }
 // Builds (or constant-folds) a MulHighUnsigned node: the upper 32 bits of the
 // 64-bit product of two unsigned 32-bit operands, i.e. ((u64)Op1 * (u64)Op2) >> 32.
 //
 // Folding rules, in order:
 //   * both operands immediate      -> the result is computed here as a constant
 //   * only Op1 immediate           -> operands are swapped so the constant is Op2
 //   * Op2 == 0 or Op2 == 1         -> constant 0 (the product fits in 32 bits)
 //   * Op2 == (1 << k), 1 <= k < 32 -> Op1 >> (32 - k)
 //   * anything else                -> a real MulHighUnsigned instruction
 InstLoc IRBuilder::FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2) {
 	// (i0 * i1) >> 32
 	if (isImm(*Op1) && isImm(*Op2)) {
 		return EmitIntConst((u32)(((u64)GetImmValue(Op1) * (u64)GetImmValue(Op2)) >> 32));
 	}
 	// The operation is commutative: canonicalize so that a lone immediate is Op2.
 	if (isImm(*Op1) && !isImm(*Op2)) {
 		return FoldMulHighUnsigned(Op2, Op1);
 	}
 	if (isImm(*Op2)) {
 		const unsigned imm = GetImmValue(Op2);
 		// (x * 0) >> 32 => 0
 		// (x * 1) >> 32 => 0
 		// The multiplier 1 must not go through the shift rewrite below: it would
 		// ask for a shift by 32, and IR shifts only honour the bottom 5 bits of
 		// the count, so the "shift" would return x unchanged instead of 0.
 		if (imm == 0 || imm == 1) {
 			return EmitIntConst(0);
 		}
 		for (unsigned i0 = 1; i0 < 32; ++i0) {
 			// (x * (1 << i0)) >> 32 => x >> (32 - i0)
 			// The shift count is always in [1, 31] here.
 			// One "shr" is faster than one "mul".
 			if (imm == (1U << i0)) {
 				return FoldShrl(Op1, EmitIntConst(32 - i0));
 			}
 		}
 	}
 	return EmitBiOp(MulHighUnsigned, Op1, Op2);
 }
 InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) {
 	simplifyCommutative(And, Op1, Op2);
@ -1001,6 +1031,7 @@ InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned
 		case Add: return FoldAdd(Op1, Op2);
 		case Sub: return FoldSub(Op1, Op2);
 		case Mul: return FoldMul(Op1, Op2);
 		case MulHighUnsigned: return FoldMulHighUnsigned(Op1, Op2);
 		case And: return FoldAnd(Op1, Op2);
 		case Or: return FoldOr(Op1, Op2);
 		case Xor: return FoldXor(Op1, Op2);
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@ -68,6 +68,7 @@ enum Opcode {
 	Or,
 	Xor,
 	// Non-commutative integer operators
 	MulHighUnsigned,
 	Sub,
 	Shl,  // Note that shifts ignore bits above the bottom 5
 	Shrl,
@ -224,6 +225,7 @@ private:
 	InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldSub(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldMul(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
@ -306,6 +308,9 @@ public:
 	// Emits a Mul of op1 and op2; the request goes through FoldBiOp, which
 	// dispatches on the opcode.
 	InstLoc EmitMul(InstLoc op1, InstLoc op2) {
 		InstLoc product = FoldBiOp(Mul, op1, op2);
 		return product;
 	}
 	// Emits a MulHighUnsigned of op1 and op2; the request goes through
 	// FoldBiOp, which dispatches on the opcode.
 	InstLoc EmitMulHighUnsigned(InstLoc op1, InstLoc op2) {
 		InstLoc highWord = FoldBiOp(MulHighUnsigned, op1, op2);
 		return highWord;
 	}
 	// Emits a Rol of op1 by op2; the request goes through FoldBiOp, which
 	// dispatches on the opcode.
 	InstLoc EmitRol(InstLoc op1, InstLoc op2) {
 		InstLoc rotated = FoldBiOp(Rol, op1, op2);
 		return rotated;
 	}
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
@ -803,6 +803,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 		case Or:
 		case Xor:
 		case Mul:
 		case MulHighUnsigned:
 		case Rol:
 		case Shl:
 		case Shrl:
@ -1105,6 +1106,23 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 			regNormalRegClear(RI, I);
 			break;
 		}
 		// MulHighUnsigned: emit x86 code that leaves the upper 32 bits of the
 		// unsigned product of the two operands in the register chosen for I.
 		case MulHighUnsigned: {
 			// No code is emitted when the result of this instruction is unused.
 			if (!thisUsed) break;
 			// The one-operand x86 MUL multiplies EAX by its operand and writes
 			// the 64-bit product to EDX:EAX, so both registers are handed to
 			// regSpill first (presumably to evict any live values - confirm
 			// against regSpill).
 			regSpill(RI, EAX);
 			regSpill(RI, EDX);
 			// Destination register for the result of I.
 			// NOTE(review): assumes regBinReg does not emit code that conflicts
 			// with the EAX/EDX usage below - confirm against its definition.
 			X64Reg reg = regBinReg(RI, I);
 			// Put operand 2 in EAX; a constant operand is loaded as an immediate
 			// because MUL itself has no immediate form.
 			if (isImm(*getOp2(I))) {
 				unsigned RHS = RI.Build->GetImmValue(getOp2(I));
 				Jit->MOV(32, R(EAX), Imm32(RHS));
 			} else {
 				Jit->MOV(32, R(EAX), regLocForInst(RI, getOp2(I)));
 			}
 			// EDX:EAX = EAX * operand 1 (unsigned).
 			Jit->MUL(32, regLocForInst(RI, getOp1(I)));
 			// Only the high half (EDX) is the result; the low half in EAX is dropped.
 			Jit->MOV(32, R(reg), R(EDX));
 			// Record that reg now holds the value of I.
 			RI.regs[reg] = I;
 			regNormalRegClear(RI, I);
 			break;
 		}
 		case Rol: {
 			if (!thisUsed) break;
 			regEmitShiftInst(RI, I, &JitIL::ROL);
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
@ -299,42 +299,12 @@ void JitIL::mulhwux(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(Integer)
 	// Compute the upper 32 bits of (a * b) using the Karatsuba algorithm.
 	// The Karatsuba algorithm reduces the number of multiplications from 4 to 3:
 	//   d = a * b
 	//     = {a1 * (1 << 16) + a0} * {b1 * (1 << 16) + b0};
 	//     = d2 * (1 << 32) + d1 * (1 << 16) + d0
 	// where
 	//   d2 = a1 * b1
 	//   d0 = a0 * b0
 	//   d1 = (a1 + a0) * (b1 + b0) - d2 - d0
 	// since
 	//   d1 = (a1 + a0) * (b1 + b0) - d2 - d0
 	//      = a1 * b1 + a1 * b0 + a0 * b1 + a0 * b0 - a1 * b1 - a0 * b0
 	//      = a1 * b0 + a0 * b1
 	// The result of mulhwux is
 	//   d2' = (((d0 >> 16) + d1) >> 16) + d2
 	//
 	// Though it is not so fast...
 	IREmitter::InstLoc a = ibuild.EmitLoadGReg(inst.RA);
 	IREmitter::InstLoc a0 = ibuild.EmitAnd(a, ibuild.EmitIntConst(0xFFFF));
 	IREmitter::InstLoc a1 = ibuild.EmitShrl(a, ibuild.EmitIntConst(16));
 	IREmitter::InstLoc b = ibuild.EmitLoadGReg(inst.RB);
-	IREmitter::InstLoc b0 = ibuild.EmitAnd(b, ibuild.EmitIntConst(0xFFFF));
+	IREmitter::InstLoc d = ibuild.EmitMulHighUnsigned(a, b);
-	IREmitter::InstLoc b1 = ibuild.EmitShrl(b, ibuild.EmitIntConst(16));
+	ibuild.EmitStoreGReg(d, inst.RD);
 	IREmitter::InstLoc d2 = ibuild.EmitMul(a1, b1);
 	IREmitter::InstLoc d0 = ibuild.EmitMul(a0, b0);
 	IREmitter::InstLoc d1 = ibuild.EmitMul(ibuild.EmitAdd(a1, a0), ibuild.EmitAdd(b1, b0));
 	d1 = ibuild.EmitSub(d1, d2);
 	d1 = ibuild.EmitSub(d1, d0);
 	d1 = ibuild.EmitAdd(d1, ibuild.EmitShrl(d0, ibuild.EmitIntConst(16)));
 	d2 = ibuild.EmitAdd(d2, ibuild.EmitShrl(d1, ibuild.EmitIntConst(16)));
 	ibuild.EmitStoreGReg(d2, inst.RD);
 	if (inst.Rc)
-		ComputeRC(ibuild, d2);
+		ComputeRC(ibuild, d);
 }
 // skipped some of the special handling in here - if we get crashes, let the interpreter handle this op