JitArm64: Optimize divwux

When the divisor is a constant value, we can emit more efficient code.
For powers of two, we can use a bit shift. For other values, we can
instead multiply by a precomputed magic constant and shift the result.

- Example 1 - Division by 16 (power of two)
Before:
mov    w24, #0x10                ; =16
udiv   w27, w25, w24

After:
lsr    w27, w25, #4

- Example 2 - Division by 10 (fast)
Before:
mov    w25, #0xa                 ; =10
udiv   w27, w26, w25

After:
mov    w27, #0xcccd              ; =52429
movk   w27, #0xcccc, lsl #16
umull  x27, w26, w27
lsr    x27, x27, #35

- Example 3 - Division by 127 (slow)
Before:
mov    w26, #0x7f                ; =127
udiv   w27, w27, w26

After:
mov    w26, #0x408               ; =1032
movk   w26, #0x8102, lsl #16
umaddl x27, w27, w26, x26
lsr    x27, x27, #38
This commit is contained in:
Bram Speeckaert 2024-03-23 13:08:35 +01:00
parent 749ee2ff5e
commit 2580837c60
1 changed file with 54 additions and 0 deletions

View File

@@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
if (inst.Rc) if (inst.Rc)
ComputeRC0(gpr.GetImm(d)); ComputeRC0(gpr.GetImm(d));
} }
else if (gpr.IsImm(b))
{
const u32 divisor = gpr.GetImm(b);
if (divisor == 0)
{
gpr.SetImmediate(d, 0);
if (inst.Rc)
ComputeRC0(0);
}
else
{
const bool allocate_reg = d == a;
gpr.BindToRegister(d, allocate_reg);
ARM64Reg RD = gpr.R(d);
ARM64Reg RA = gpr.R(a);
if (MathUtil::IsPow2(divisor))
{
int shift = MathUtil::IntLog2(divisor);
if (shift)
LSR(RD, RA, shift);
else if (d != a)
MOV(RD, RA);
}
else
{
UnsignedMagic m = UnsignedDivisionConstants(divisor);
ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
ARM64Reg XD = EncodeRegTo64(RD);
MOVI2R(WI, m.multiplier);
if (m.fast)
{
UMULL(XD, RA, WI);
}
else
{
UMADDL(XD, RA, WI, EncodeRegTo64(WI));
}
LSR(XD, XD, 32 + m.shift);
if (allocate_reg)
gpr.Unlock(WI);
}
if (inst.Rc)
ComputeRC0(gpr.R(d));
}
}
else else
{ {
gpr.BindToRegister(d, d == a || d == b); gpr.BindToRegister(d, d == a || d == b);