JitIL: Added a new IR instruction, MulHighUnsigned, which computes the upper 32 bits of the product of two unsigned 32-bit integers. Rewrote mulhwux with MulHighUnsigned.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6145 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
e1d1a1eba0
commit
a3df65bd02
|
@ -617,6 +617,36 @@ InstLoc IRBuilder::FoldMul(InstLoc Op1, InstLoc Op2) {
|
|||
return EmitBiOp(Mul, Op1, Op2);
|
||||
}
|
||||
|
||||
// Builds the IR for the upper 32 bits of the unsigned 64-bit product of
// Op1 and Op2, i.e. (u32)(((u64)Op1 * (u64)Op2) >> 32), simplifying first:
//   - two immediates are folded into a single constant;
//   - a lone immediate in Op1 is moved to Op2 (the operation is symmetric);
//   - a multiplier of 0 or 1 always yields 0, since the product fits in 32 bits;
//   - a multiplier of 2^k (1 <= k <= 31) becomes a logical right shift by 32 - k.
// If none of the above applies, a MulHighUnsigned instruction is emitted.
InstLoc IRBuilder::FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2) {
	// (i0 * i1) >> 32
	if (isImm(*Op1) && isImm(*Op2)) {
		return EmitIntConst((u32)(((u64)GetImmValue(Op1) * (u64)GetImmValue(Op2)) >> 32));
	}

	// Canonicalize so that a single immediate operand is always Op2.
	if (isImm(*Op1) && !isImm(*Op2)) {
		return FoldMulHighUnsigned(Op2, Op1);
	}

	if (isImm(*Op2)) {
		const unsigned imm = GetImmValue(Op2);

		// (x * 0) >> 32 => 0
		// (x * 1) >> 32 => 0
		// The multiply-by-one case must not be turned into a shift: it would
		// need a shift amount of 32, and IR shifts ignore bits above the
		// bottom 5, so the shift would act as a shift by 0 and return x.
		if (imm <= 1) {
			return EmitIntConst(0);
		}

		// Shift amounts produced here are in [1, 31], so they are never
		// affected by the 5-bit masking of shift counts.
		for (unsigned i0 = 1; i0 < 32; ++i0) {
			// (x * (1 << i0)) >> 32 => x >> (32 - i0)
			// One "shr" is faster than one "mul".
			if (imm == (1U << i0)) {
				return FoldShrl(Op1, EmitIntConst(32 - i0));
			}
		}
	}

	return EmitBiOp(MulHighUnsigned, Op1, Op2);
}
|
||||
|
||||
InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) {
|
||||
simplifyCommutative(And, Op1, Op2);
|
||||
|
||||
|
@ -1001,6 +1031,7 @@ InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned
|
|||
case Add: return FoldAdd(Op1, Op2);
|
||||
case Sub: return FoldSub(Op1, Op2);
|
||||
case Mul: return FoldMul(Op1, Op2);
|
||||
case MulHighUnsigned: return FoldMulHighUnsigned(Op1, Op2);
|
||||
case And: return FoldAnd(Op1, Op2);
|
||||
case Or: return FoldOr(Op1, Op2);
|
||||
case Xor: return FoldXor(Op1, Op2);
|
||||
|
|
|
@ -68,6 +68,7 @@ enum Opcode {
|
|||
Or,
|
||||
Xor,
|
||||
// Non-commutative integer operators
|
||||
MulHighUnsigned,
|
||||
Sub,
|
||||
Shl, // Note that shifts ignore bits above the bottom 5
|
||||
Shrl,
|
||||
|
@ -224,6 +225,7 @@ private:
|
|||
InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldSub(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldMul(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
|
||||
InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
|
||||
|
@ -306,6 +308,9 @@ public:
|
|||
// Builds a Mul of op1 and op2, routed through FoldBiOp so it can be simplified before being emitted.
InstLoc EmitMul(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(Mul, op1, op2);
|
||||
}
|
||||
// Builds a MulHighUnsigned (upper 32 bits of the unsigned product) of op1 and op2, routed through FoldBiOp so it can be simplified before being emitted.
InstLoc EmitMulHighUnsigned(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(MulHighUnsigned, op1, op2);
|
||||
}
|
||||
// Builds a Rol of op1 by op2, routed through FoldBiOp with the Rol opcode.
InstLoc EmitRol(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(Rol, op1, op2);
|
||||
}
|
||||
|
|
|
@ -803,6 +803,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
|
|||
case Or:
|
||||
case Xor:
|
||||
case Mul:
|
||||
case MulHighUnsigned:
|
||||
case Rol:
|
||||
case Shl:
|
||||
case Shrl:
|
||||
|
@ -1105,6 +1106,23 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
|
|||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case MulHighUnsigned: {
|
||||
if (!thisUsed) break;
|
||||
regSpill(RI, EAX);
|
||||
regSpill(RI, EDX);
|
||||
X64Reg reg = regBinReg(RI, I);
|
||||
if (isImm(*getOp2(I))) {
|
||||
unsigned RHS = RI.Build->GetImmValue(getOp2(I));
|
||||
Jit->MOV(32, R(EAX), Imm32(RHS));
|
||||
} else {
|
||||
Jit->MOV(32, R(EAX), regLocForInst(RI, getOp2(I)));
|
||||
}
|
||||
Jit->MUL(32, regLocForInst(RI, getOp1(I)));
|
||||
Jit->MOV(32, R(reg), R(EDX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case Rol: {
|
||||
if (!thisUsed) break;
|
||||
regEmitShiftInst(RI, I, &JitIL::ROL);
|
||||
|
|
|
@ -299,42 +299,12 @@ void JitIL::mulhwux(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(Integer)
|
||||
|
||||
// Compute the upper 32 bits of (a * b) using the Karatsuba algorithm
|
||||
// The Karatsuba algorithm reduces the number of multiplications from 4 to 3
|
||||
// d = a * b
|
||||
// = {a1 * (1 << 16) + a0} * {b1 * (1 << 16) + b0};
|
||||
// = d2 * (1 << 32) + d1 * (1 << 16) + d0
|
||||
// where
|
||||
// d2 = a1 * b1
|
||||
// d0 = a0 * b0
|
||||
// d1 = (a1 + a0) * (b1 + b0) - d2 - d0
|
||||
// since
|
||||
// d1 = (a1 + a0) * (b1 + b0) - d2 - d0
|
||||
// = a1 * b1 + a1 * b0 + a0 * b1 + a0 * b0 - a1 * b1 - a0 * b0
|
||||
// = a1 * b0 + a0 * b1
|
||||
// The result of mulhwux is
|
||||
// d2' = (((d0 >> 16) + d1) >> 16) + d2
|
||||
//
|
||||
// Though it is not so fast...
|
||||
IREmitter::InstLoc a = ibuild.EmitLoadGReg(inst.RA);
|
||||
IREmitter::InstLoc a0 = ibuild.EmitAnd(a, ibuild.EmitIntConst(0xFFFF));
|
||||
IREmitter::InstLoc a1 = ibuild.EmitShrl(a, ibuild.EmitIntConst(16));
|
||||
IREmitter::InstLoc b = ibuild.EmitLoadGReg(inst.RB);
|
||||
IREmitter::InstLoc b0 = ibuild.EmitAnd(b, ibuild.EmitIntConst(0xFFFF));
|
||||
IREmitter::InstLoc b1 = ibuild.EmitShrl(b, ibuild.EmitIntConst(16));
|
||||
|
||||
IREmitter::InstLoc d2 = ibuild.EmitMul(a1, b1);
|
||||
IREmitter::InstLoc d0 = ibuild.EmitMul(a0, b0);
|
||||
IREmitter::InstLoc d1 = ibuild.EmitMul(ibuild.EmitAdd(a1, a0), ibuild.EmitAdd(b1, b0));
|
||||
d1 = ibuild.EmitSub(d1, d2);
|
||||
d1 = ibuild.EmitSub(d1, d0);
|
||||
|
||||
d1 = ibuild.EmitAdd(d1, ibuild.EmitShrl(d0, ibuild.EmitIntConst(16)));
|
||||
d2 = ibuild.EmitAdd(d2, ibuild.EmitShrl(d1, ibuild.EmitIntConst(16)));
|
||||
|
||||
ibuild.EmitStoreGReg(d2, inst.RD);
|
||||
IREmitter::InstLoc d = ibuild.EmitMulHighUnsigned(a, b);
|
||||
ibuild.EmitStoreGReg(d, inst.RD);
|
||||
if (inst.Rc)
|
||||
ComputeRC(ibuild, d2);
|
||||
ComputeRC(ibuild, d);
|
||||
}
|
||||
|
||||
// skipped some of the special handling in here - if we get crashes, let the interpreter handle this op
|
||||
|
|
Loading…
Reference in New Issue