JitIL: Added a new IR instruction, MulHighUnsigned, which computes the upper 32 bits of the product of two unsigned 32-bit integers. Rewrote mulhwux to use MulHighUnsigned.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6145 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
nodchip 2010-08-29 08:00:51 +00:00
parent e1d1a1eba0
commit a3df65bd02
4 changed files with 57 additions and 33 deletions

View File

@ -617,6 +617,36 @@ InstLoc IRBuilder::FoldMul(InstLoc Op1, InstLoc Op2) {
return EmitBiOp(Mul, Op1, Op2);
}
// Folds an unsigned "multiply high": the upper 32 bits of the 64-bit product
// of two unsigned 32-bit operands, i.e. (u32)(((u64)Op1 * (u64)Op2) >> 32).
//
// Simplifications, tried in order:
//   - both operands are immediates: fold to a constant;
//   - only Op1 is an immediate: swap so the immediate ends up in Op2;
//   - Op2 is 0 or 1: the product fits in 32 bits, so the high word is 0;
//   - Op2 is (1 << k) with 1 <= k <= 31: the high word is x >> (32 - k).
// Otherwise a MulHighUnsigned instruction is emitted.
InstLoc IRBuilder::FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2) {
	// (i0 * i1) >> 32
	if (isImm(*Op1) && isImm(*Op2)) {
		return EmitIntConst((u32)(((u64)GetImmValue(Op1) * (u64)GetImmValue(Op2)) >> 32));
	}

	// Canonicalize so that the immediate operand, if any, is Op2.
	if (isImm(*Op1) && !isImm(*Op2)) {
		return FoldMulHighUnsigned(Op2, Op1);
	}

	if (isImm(*Op2)) {
		const unsigned imm = GetImmValue(Op2);

		// (x * 0) >> 32 => 0
		// (x * 1) >> 32 => 0
		// The multiplier 1 must not be turned into "x >> 32": shifts ignore
		// bits above the bottom 5 of the count, so that would be a shift by 0
		// and would yield x instead of 0.
		if (imm <= 1) {
			return EmitIntConst(0);
		}

		// Shift counts produced here are in the range 1..31.
		for (unsigned i0 = 1; i0 < 32; ++i0) {
			// (x * (1 << i0)) >> 32 => x >> (32 - i0)
			// One "shr" is faster than one "mul".
			if (imm == (1U << i0)) {
				return FoldShrl(Op1, EmitIntConst(32 - i0));
			}
		}
	}

	return EmitBiOp(MulHighUnsigned, Op1, Op2);
}
InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) {
simplifyCommutative(And, Op1, Op2);
@ -1001,6 +1031,7 @@ InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned
case Add: return FoldAdd(Op1, Op2);
case Sub: return FoldSub(Op1, Op2);
case Mul: return FoldMul(Op1, Op2);
case MulHighUnsigned: return FoldMulHighUnsigned(Op1, Op2);
case And: return FoldAnd(Op1, Op2);
case Or: return FoldOr(Op1, Op2);
case Xor: return FoldXor(Op1, Op2);

View File

@ -68,6 +68,7 @@ enum Opcode {
Or,
Xor,
// Non-commutative integer operators
MulHighUnsigned,
Sub,
Shl, // Note that shifts ignore bits above the bottom 5
Shrl,
@ -224,6 +225,7 @@ private:
InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
InstLoc FoldSub(InstLoc Op1, InstLoc Op2);
InstLoc FoldMul(InstLoc Op1, InstLoc Op2);
InstLoc FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2);
InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
@ -306,6 +308,9 @@ public:
// Builds the product of op1 and op2. The request goes through FoldBiOp with
// the Mul opcode rather than emitting the instruction directly.
InstLoc EmitMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(Mul, op1, op2);
}
// Builds the upper 32 bits of the unsigned 64-bit product of op1 and op2.
// The request goes through FoldBiOp with the MulHighUnsigned opcode, which
// dispatches to FoldMulHighUnsigned.
InstLoc EmitMulHighUnsigned(InstLoc op1, InstLoc op2) {
return FoldBiOp(MulHighUnsigned, op1, op2);
}
// Builds a Rol operation on op1 and op2. The request goes through FoldBiOp
// with the Rol opcode rather than emitting the instruction directly.
InstLoc EmitRol(InstLoc op1, InstLoc op2) {
return FoldBiOp(Rol, op1, op2);
}

View File

@ -803,6 +803,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
case Or:
case Xor:
case Mul:
case MulHighUnsigned:
case Rol:
case Shl:
case Shrl:
@ -1105,6 +1106,23 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
regNormalRegClear(RI, I);
break;
}
// Host code generation for MulHighUnsigned: multiply the two operands and
// keep the value that ends up in EDX as the result of this instruction.
case MulHighUnsigned: {
// Nothing is emitted when thisUsed is false.
if (!thisUsed) break;
// EAX is overwritten by the MOV below and EDX is read after the MUL, so
// whatever currently lives in them is spilled first.
regSpill(RI, EAX);
regSpill(RI, EDX);
// Register that will be recorded as holding the result of I.
// NOTE(review): regBinReg is opaque from here; this sequence presumably
// relies on it not handing back EAX/EDX in a way that conflicts with the
// MOV/MUL below - confirm against the register allocator.
X64Reg reg = regBinReg(RI, I);
// Load the second operand into EAX, either as an immediate or from wherever
// the register allocator currently keeps it.
if (isImm(*getOp2(I))) {
unsigned RHS = RI.Build->GetImmValue(getOp2(I));
Jit->MOV(32, R(EAX), Imm32(RHS));
} else {
Jit->MOV(32, R(EAX), regLocForInst(RI, getOp2(I)));
}
// Assuming Jit->MUL emits the x86 one-operand unsigned MUL, this computes
// EDX:EAX = EAX * op1, leaving the upper 32 bits of the product in EDX.
Jit->MUL(32, regLocForInst(RI, getOp1(I)));
// Copy the high half into the result register and record that it holds I.
Jit->MOV(32, R(reg), R(EDX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case Rol: {
if (!thisUsed) break;
regEmitShiftInst(RI, I, &JitIL::ROL);

View File

@ -299,42 +299,12 @@ void JitIL::mulhwux(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(Integer)
// Compute the upper 32 bits of (a * b) using the Karatsuba algorithm.
// The Karatsuba algorithm reduces the number of multiplications from 4 to 3.
// d = a * b
// = {a1 * (1 << 16) + a0} * {b1 * (1 << 16) + b0};
// = d2 * (1 << 32) + d1 * (1 << 16) + d0
// where
// d2 = a1 * b1
// d0 = a0 * b0
// d1 = (a1 + a0) * (b1 + b0) - d2 - d0
// since
// d1 = (a1 + a0) * (b1 + b0) - d2 - d0
// = a1 * b1 + a1 * b0 + a0 * b1 + a0 * b0 - a1 * b1 - a0 * b0
// = a1 * b0 + a0 * b1
// The result of mulhwux is
// d2' = (((d0 >> 16) + d1) >> 16) + d2
//
// Though it is not so fast...
IREmitter::InstLoc a = ibuild.EmitLoadGReg(inst.RA);
IREmitter::InstLoc a0 = ibuild.EmitAnd(a, ibuild.EmitIntConst(0xFFFF));
IREmitter::InstLoc a1 = ibuild.EmitShrl(a, ibuild.EmitIntConst(16));
IREmitter::InstLoc b = ibuild.EmitLoadGReg(inst.RB);
IREmitter::InstLoc b0 = ibuild.EmitAnd(b, ibuild.EmitIntConst(0xFFFF));
IREmitter::InstLoc b1 = ibuild.EmitShrl(b, ibuild.EmitIntConst(16));
IREmitter::InstLoc d2 = ibuild.EmitMul(a1, b1);
IREmitter::InstLoc d0 = ibuild.EmitMul(a0, b0);
IREmitter::InstLoc d1 = ibuild.EmitMul(ibuild.EmitAdd(a1, a0), ibuild.EmitAdd(b1, b0));
d1 = ibuild.EmitSub(d1, d2);
d1 = ibuild.EmitSub(d1, d0);
d1 = ibuild.EmitAdd(d1, ibuild.EmitShrl(d0, ibuild.EmitIntConst(16)));
d2 = ibuild.EmitAdd(d2, ibuild.EmitShrl(d1, ibuild.EmitIntConst(16)));
ibuild.EmitStoreGReg(d2, inst.RD);
IREmitter::InstLoc d = ibuild.EmitMulHighUnsigned(a, b);
ibuild.EmitStoreGReg(d, inst.RD);
if (inst.Rc)
ComputeRC(ibuild, d2);
ComputeRC(ibuild, d);
}
// skipped some of the special handling in here - if we get crashes, let the interpreter handle this op