diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
index 3b2e2a2f26..e9d11e99f3 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@@ -617,6 +617,36 @@ InstLoc IRBuilder::FoldMul(InstLoc Op1, InstLoc Op2) {
 	return EmitBiOp(Mul, Op1, Op2);
 }
 
+InstLoc IRBuilder::FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2) {
+	// (i0 * i1) >> 32
+	if (isImm(*Op1) && isImm(*Op2)) {
+		return EmitIntConst((u32)(((u64)GetImmValue(Op1) * (u64)GetImmValue(Op2)) >> 32));
+	}
+
+	if (isImm(*Op1) && !isImm(*Op2)) {
+		return FoldMulHighUnsigned(Op2, Op1);
+	}
+
+	if (isImm(*Op2)) {
+		const unsigned imm = GetImmValue(Op2);
+
+		// (x * 0) >> 32 => 0
+		if (imm == 0) {
+			return EmitIntConst(0);
+		}
+
+		for (unsigned i0 = 0; i0 < 30; ++i0) {
+			// (x * (1 << i0)) => x >> (32 - i0)
+			// One "shl" is faster than one "imul".
+			if (imm == (1U << i0)) {
+				return FoldShrl(Op1, EmitIntConst(32 - i0));
+			}
+		}
+	}
+
+	return EmitBiOp(MulHighUnsigned, Op1, Op2);
+}
+
 InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) {
 	simplifyCommutative(And, Op1, Op2);
 
@@ -1001,6 +1031,7 @@ InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned
 	case Add: return FoldAdd(Op1, Op2);
 	case Sub: return FoldSub(Op1, Op2);
 	case Mul: return FoldMul(Op1, Op2);
+	case MulHighUnsigned: return FoldMulHighUnsigned(Op1, Op2);
 	case And: return FoldAnd(Op1, Op2);
 	case Or: return FoldOr(Op1, Op2);
 	case Xor: return FoldXor(Op1, Op2);
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
index 06ed7be813..f11ae72f7e 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@@ -68,6 +68,7 @@ enum Opcode {
 	Or,
 	Xor,
 	// Non-commutative integer operators
+	MulHighUnsigned,
 	Sub,
 	Shl, // Note that shifts ignore bits above the bottom 5
 	Shrl,
@@ -224,6 +225,7 @@ private:
 	InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldSub(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldMul(InstLoc Op1, InstLoc Op2);
+	InstLoc FoldMulHighUnsigned(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldOr(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldRol(InstLoc Op1, InstLoc Op2);
@@ -306,6 +308,9 @@ public:
 	InstLoc EmitMul(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(Mul, op1, op2);
 	}
+	InstLoc EmitMulHighUnsigned(InstLoc op1, InstLoc op2) {
+		return FoldBiOp(MulHighUnsigned, op1, op2);
+	}
 	InstLoc EmitRol(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(Rol, op1, op2);
 	}
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
index 554b0ac67a..11e2f7143f 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
@@ -803,6 +803,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 		case Or:
 		case Xor:
 		case Mul:
+		case MulHighUnsigned:
 		case Rol:
 		case Shl:
 		case Shrl:
@@ -1105,6 +1106,23 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
 			regNormalRegClear(RI, I);
 			break;
 		}
+		case MulHighUnsigned: {
+			if (!thisUsed) break;
+			regSpill(RI, EAX);
+			regSpill(RI, EDX);
+			X64Reg reg = regBinReg(RI, I);
+			if (isImm(*getOp2(I))) {
+				unsigned RHS = RI.Build->GetImmValue(getOp2(I));
+				Jit->MOV(32, R(EAX), Imm32(RHS));
+			} else {
+				Jit->MOV(32, R(EAX), regLocForInst(RI, getOp2(I)));
+			}
+			Jit->MUL(32, regLocForInst(RI, getOp1(I)));
+			Jit->MOV(32, R(reg), R(EDX));
+			RI.regs[reg] = I;
+			regNormalRegClear(RI, I);
+			break;
+		}
 		case Rol: {
 			if (!thisUsed) break;
 			regEmitShiftInst(RI, I, &JitIL::ROL);
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
index 6029c031ab..572cb63861 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_Integer.cpp
@@ -299,42 +299,12 @@ void JitIL::mulhwux(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(Integer)
 
-	// Compute upper 32-bit of (a * b) using Karatsuba algorithm
-	// Karatsuba algorithm reduces the number of multiplication 4 to 3
-	// d = a * b
-	//   = {a1 * (1 << 16) + a0} * {b1 * (1 << 16) + b0};
-	//   = d2 * (1 << 32) + d1 * (1 << 16) + d0
-	// where
-	//   d2 = a1 * b1
-	//   d0 = a0 * b0
-	//   d1 = (a1 + a0) * (b1 * b0) - d2 - d0
-	// since
-	//   d1 = (a1 + a0) * (b1 * b0) - d2 - d0
-	//      = a1 * b1 + a1 * b0 + a0 * b1 + a0 * b0 - a1 * b1 - a0 * b0
-	//      = a1 * b0 + a0 * b1
-	// The result of mulhwux is
-	//   d2' = (((d0 >> 16) + d1) >> 16) + d2
-	//
-	// Though it is not so fast...
 	IREmitter::InstLoc a = ibuild.EmitLoadGReg(inst.RA);
-	IREmitter::InstLoc a0 = ibuild.EmitAnd(a, ibuild.EmitIntConst(0xFFFF));
-	IREmitter::InstLoc a1 = ibuild.EmitShrl(a, ibuild.EmitIntConst(16));
 	IREmitter::InstLoc b = ibuild.EmitLoadGReg(inst.RB);
-	IREmitter::InstLoc b0 = ibuild.EmitAnd(b, ibuild.EmitIntConst(0xFFFF));
-	IREmitter::InstLoc b1 = ibuild.EmitShrl(b, ibuild.EmitIntConst(16));
-
-	IREmitter::InstLoc d2 = ibuild.EmitMul(a1, b1);
-	IREmitter::InstLoc d0 = ibuild.EmitMul(a0, b0);
-	IREmitter::InstLoc d1 = ibuild.EmitMul(ibuild.EmitAdd(a1, a0), ibuild.EmitAdd(b1, b0));
-	d1 = ibuild.EmitSub(d1, d2);
-	d1 = ibuild.EmitSub(d1, d0);
-
-	d1 = ibuild.EmitAdd(d1, ibuild.EmitShrl(d0, ibuild.EmitIntConst(16)));
-	d2 = ibuild.EmitAdd(d2, ibuild.EmitShrl(d1, ibuild.EmitIntConst(16)));
-
-	ibuild.EmitStoreGReg(d2, inst.RD);
+	IREmitter::InstLoc d = ibuild.EmitMulHighUnsigned(a, b);
+	ibuild.EmitStoreGReg(d, inst.RD);
 	if (inst.Rc)
-		ComputeRC(ibuild, d2);
+		ComputeRC(ibuild, d);
 }
 
 // skipped some of the special handling in here - if we get crashes, let the interpreter handle this op
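
For context, here is a minimal standalone C++ sketch (not part of the patch and not Dolphin API) of what the new MulHighUnsigned opcode computes and of the power-of-two strength reduction that FoldMulHighUnsigned applies; the helper names mulhi_reference and mulhi_pow2 are illustrative only.

// Standalone illustration only; names are hypothetical, not Dolphin's.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Reference semantics of PowerPC mulhwux and of the MulHighUnsigned IR op:
// the upper 32 bits of the full 64-bit unsigned product.
static uint32_t mulhi_reference(uint32_t a, uint32_t b)
{
	return (uint32_t)(((uint64_t)a * (uint64_t)b) >> 32);
}

// Strength reduction used when one operand is a power of two:
// ((uint64_t)x * (1u << k)) >> 32 == x >> (32 - k) for 1 <= k <= 31;
// for k == 0 the high word is simply 0.
static uint32_t mulhi_pow2(uint32_t x, unsigned k)
{
	return k == 0 ? 0 : x >> (32 - k);
}

int main()
{
	const uint32_t samples[] = { 0, 1, 2, 0xFFFF, 0x12345678, 0x80000000, 0xFFFFFFFF };
	for (uint32_t x : samples)
		for (unsigned k = 0; k < 32; ++k)
			assert(mulhi_reference(x, 1u << k) == mulhi_pow2(x, k));

	printf("mulhi(0xFFFFFFFF, 0xFFFFFFFF) = 0x%08X\n",
	       (unsigned)mulhi_reference(0xFFFFFFFFu, 0xFFFFFFFFu)); // 0xFFFFFFFE
	return 0;
}

On the code-generation side, the one-operand x86 MUL writes the 64-bit product into EDX:EAX, which is why the MulHighUnsigned case in IR_X86.cpp spills EAX and EDX, multiplies, and copies EDX into the destination register; that sequence is exactly mulhi_reference above.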