Merge pull request #12661 from Sintendo/arm64divwux
JitArm64: Optimize divwux
This commit is contained in:
commit
5f6a054ffc
|
@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||
}
|
||||
else
|
||||
{
|
||||
u32 shift = 31;
|
||||
while (!(divisor & (1 << shift)))
|
||||
shift--;
|
||||
|
||||
if (divisor == (u32)(1 << shift))
|
||||
if (MathUtil::IsPow2(divisor))
|
||||
{
|
||||
u32 shift = MathUtil::IntLog2(divisor);
|
||||
|
||||
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||
RegCache::Realize(Ra, Rd);
|
||||
|
@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||
}
|
||||
else
|
||||
{
|
||||
u64 magic_dividend = 0x100000000ULL << shift;
|
||||
u32 magic = (u32)(magic_dividend / divisor);
|
||||
u32 max_quotient = magic >> shift;
|
||||
UnsignedMagic m = UnsignedDivisionConstants(divisor);
|
||||
|
||||
// Test for failure in round-up method
|
||||
if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1)
|
||||
if (!m.fast)
|
||||
{
|
||||
// If failed, use slower round-down method
|
||||
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||
RegCache::Realize(Ra, Rd);
|
||||
|
||||
MOV(32, R(RSCRATCH), Imm32(magic));
|
||||
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
|
||||
if (d != a)
|
||||
MOV(32, Rd, Ra);
|
||||
IMUL(64, Rd, R(RSCRATCH));
|
||||
ADD(64, Rd, R(RSCRATCH));
|
||||
SHR(64, Rd, Imm8(shift + 32));
|
||||
SHR(64, Rd, Imm8(m.shift + 32));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||
RegCache::Realize(Ra, Rd);
|
||||
|
||||
magic++;
|
||||
|
||||
// Use smallest magic number and shift amount possible
|
||||
while ((magic & 1) == 0 && shift > 0)
|
||||
{
|
||||
magic >>= 1;
|
||||
shift--;
|
||||
}
|
||||
|
||||
// Three-operand IMUL sign extends the immediate to 64 bits, so we may only
|
||||
// use it when the magic number has its most significant bit set to 0
|
||||
if ((magic & 0x80000000) == 0)
|
||||
if ((m.multiplier & 0x80000000) == 0)
|
||||
{
|
||||
IMUL(64, Rd, Ra, Imm32(magic));
|
||||
IMUL(64, Rd, Ra, Imm32(m.multiplier));
|
||||
}
|
||||
else if (d == a)
|
||||
{
|
||||
MOV(32, R(RSCRATCH), Imm32(magic));
|
||||
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
|
||||
IMUL(64, Rd, R(RSCRATCH));
|
||||
}
|
||||
else
|
||||
{
|
||||
MOV(32, Rd, Imm32(magic));
|
||||
MOV(32, Rd, Imm32(m.multiplier));
|
||||
IMUL(64, Rd, Ra);
|
||||
}
|
||||
SHR(64, Rd, Imm8(shift + 32));
|
||||
SHR(64, Rd, Imm8(m.shift + 32));
|
||||
}
|
||||
}
|
||||
if (inst.OE)
|
||||
|
@ -1792,7 +1779,7 @@ void Jit64::divwx(UGeckoInstruction inst)
|
|||
else
|
||||
{
|
||||
// Optimize signed 32-bit integer division by a constant
|
||||
Magic m = SignedDivisionConstants(divisor);
|
||||
SignedMagic m = SignedDivisionConstants(divisor);
|
||||
|
||||
MOVSX(64, 32, RSCRATCH, Ra);
|
||||
|
||||
|
|
|
@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
|
|||
if (inst.Rc)
|
||||
ComputeRC0(gpr.GetImm(d));
|
||||
}
|
||||
else if (gpr.IsImm(b))
|
||||
{
|
||||
const u32 divisor = gpr.GetImm(b);
|
||||
|
||||
if (divisor == 0)
|
||||
{
|
||||
gpr.SetImmediate(d, 0);
|
||||
if (inst.Rc)
|
||||
ComputeRC0(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
const bool allocate_reg = d == a;
|
||||
gpr.BindToRegister(d, allocate_reg);
|
||||
|
||||
ARM64Reg RD = gpr.R(d);
|
||||
ARM64Reg RA = gpr.R(a);
|
||||
|
||||
if (MathUtil::IsPow2(divisor))
|
||||
{
|
||||
int shift = MathUtil::IntLog2(divisor);
|
||||
if (shift)
|
||||
LSR(RD, RA, shift);
|
||||
else if (d != a)
|
||||
MOV(RD, RA);
|
||||
}
|
||||
else
|
||||
{
|
||||
UnsignedMagic m = UnsignedDivisionConstants(divisor);
|
||||
|
||||
ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
|
||||
ARM64Reg XD = EncodeRegTo64(RD);
|
||||
|
||||
MOVI2R(WI, m.multiplier);
|
||||
|
||||
if (m.fast)
|
||||
{
|
||||
UMULL(XD, RA, WI);
|
||||
}
|
||||
else
|
||||
{
|
||||
UMADDL(XD, RA, WI, EncodeRegTo64(WI));
|
||||
}
|
||||
|
||||
LSR(XD, XD, 32 + m.shift);
|
||||
|
||||
if (allocate_reg)
|
||||
gpr.Unlock(WI);
|
||||
}
|
||||
|
||||
if (inst.Rc)
|
||||
ComputeRC0(gpr.R(d));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
gpr.BindToRegister(d, d == a || d == b);
|
||||
|
@ -1675,7 +1729,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
|
|||
else
|
||||
{
|
||||
// Optimize signed 32-bit integer division by a constant
|
||||
Magic m = SignedDivisionConstants(divisor);
|
||||
SignedMagic m = SignedDivisionConstants(divisor);
|
||||
|
||||
ARM64Reg WA = gpr.GetReg();
|
||||
ARM64Reg WB = gpr.GetReg();
|
||||
|
|
|
@ -3,16 +3,18 @@
|
|||
|
||||
#include "Core/PowerPC/JitCommon/DivUtils.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <bit>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace JitCommon
|
||||
{
|
||||
Magic SignedDivisionConstants(s32 d)
|
||||
SignedMagic SignedDivisionConstants(s32 divisor)
|
||||
{
|
||||
const u32 two31 = 2147483648;
|
||||
|
||||
const u32 ad = std::abs(d);
|
||||
const u32 t = two31 - (d < 0);
|
||||
const u32 ad = std::abs(divisor);
|
||||
const u32 t = two31 - (divisor < 0);
|
||||
const u32 anc = t - 1 - t % ad;
|
||||
u32 q1 = two31 / anc;
|
||||
u32 r1 = two31 - q1 * anc;
|
||||
|
@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d)
|
|||
delta = ad - r2;
|
||||
} while (q1 < delta || (q1 == delta && r1 == 0));
|
||||
|
||||
Magic mag;
|
||||
SignedMagic mag;
|
||||
mag.multiplier = q2 + 1;
|
||||
if (d < 0)
|
||||
if (divisor < 0)
|
||||
mag.multiplier = -mag.multiplier;
|
||||
mag.shift = p - 32;
|
||||
|
||||
return mag;
|
||||
}
|
||||
|
||||
// Derives the multiplier and shift that let an unsigned 32-bit division by the
// constant `divisor` be replaced with a 64-bit multiply followed by a right
// shift of (32 + shift).
//
// The divisor must not be 0, 1 or a power of two.
//
// `fast` is set when the round-up method passes the check below. In that case
// the incremented multiplier is returned, with the multiplier and shift reduced
// together as far as possible. Otherwise the round-down multiplier and the
// original shift are returned unchanged.
UnsignedMagic UnsignedDivisionConstants(u32 divisor)
{
  // Bit index of the divisor's most significant set bit.
  u32 shift = 31 - std::countl_zero(divisor);

  // Round-down candidate: 2^(32 + shift) / divisor, truncated to 32 bits.
  const u64 scaled_power = 0x100000000ULL << shift;
  u32 multiplier = scaled_power / divisor;
  const u32 max_quotient = multiplier >> shift;

  // Probe the round-up method (multiplier + 1) with the dividend
  // max_quotient * divisor - 1; it is accepted only when the probe produces
  // max_quotient - 1.
  const u32 probe_dividend = max_quotient * divisor - 1;
  const u64 probe_product = u64(multiplier + 1) * probe_dividend;
  const u32 probe_quotient = probe_product >> (shift + 32);
  const bool fast = probe_quotient == max_quotient - 1;

  if (fast)
  {
    ++multiplier;

    // Drop trailing zero bits of the multiplier, but never more than the shift
    // can absorb, so the smallest multiplier/shift pair is returned.
    const u32 low_zeroes = std::countr_zero(multiplier);
    const u32 reduction = low_zeroes < shift ? low_zeroes : shift;
    multiplier >>= reduction;
    shift -= reduction;
  }

  UnsignedMagic result;
  result.fast = fast;
  result.shift = shift;
  result.multiplier = multiplier;
  return result;
}
|
||||
|
||||
} // namespace JitCommon
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
namespace JitCommon
|
||||
{
|
||||
struct Magic
|
||||
struct SignedMagic
|
||||
{
|
||||
s32 multiplier;
|
||||
u8 shift;
|
||||
|
@ -16,6 +16,27 @@ struct Magic
|
|||
// Calculate the constants required to optimize a signed 32-bit integer division.
|
||||
// Taken from The PowerPC Compiler Writer's Guide and LLVM.
|
||||
// Divisor must not be -1, 0, 1 or INT_MIN.
|
||||
Magic SignedDivisionConstants(s32 divisor);
|
||||
SignedMagic SignedDivisionConstants(s32 divisor);
|
||||
|
||||
// Constants produced by UnsignedDivisionConstants() for replacing an unsigned
// 32-bit division by a constant with a multiply and a shift.
struct UnsignedMagic
|
||||
{
|
||||
// Value the dividend is multiplied by (as a 64-bit product).
u32 multiplier;
|
||||
// Extra right shift; the JITs shift the 64-bit product right by 32 + shift.
u8 shift;
|
||||
// True when the plain multiply-and-shift sequence is valid. When false, the
// JITs additionally add the multiplier to the product before shifting.
bool fast;
|
||||
};
|
||||
|
||||
/// Calculate the constants required to optimize an unsigned 32-bit integer
|
||||
/// division.
|
||||
/// Divisor must not be 0, 1, or a power of two.
|
||||
///
|
||||
/// Original implementation by calc84maniac.
|
||||
/// Results are the same as the approach laid out in Hacker's Delight, with an
|
||||
/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by
|
||||
/// ridiculousfish.
|
||||
///
|
||||
/// See also:
|
||||
/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
|
||||
/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers
|
||||
UnsignedMagic UnsignedDivisionConstants(u32 divisor);
|
||||
|
||||
} // namespace JitCommon
|
||||
|
|
|
@ -9,12 +9,12 @@ using namespace JitCommon;
|
|||
|
||||
TEST(DivUtils, Signed)
|
||||
{
|
||||
Magic m3 = SignedDivisionConstants(3);
|
||||
Magic m5 = SignedDivisionConstants(5);
|
||||
Magic m7 = SignedDivisionConstants(7);
|
||||
Magic minus3 = SignedDivisionConstants(-3);
|
||||
Magic minus5 = SignedDivisionConstants(-5);
|
||||
Magic minus7 = SignedDivisionConstants(-7);
|
||||
SignedMagic m3 = SignedDivisionConstants(3);
|
||||
SignedMagic m5 = SignedDivisionConstants(5);
|
||||
SignedMagic m7 = SignedDivisionConstants(7);
|
||||
SignedMagic minus3 = SignedDivisionConstants(-3);
|
||||
SignedMagic minus5 = SignedDivisionConstants(-5);
|
||||
SignedMagic minus7 = SignedDivisionConstants(-7);
|
||||
|
||||
EXPECT_EQ(0x55555556, m3.multiplier);
|
||||
EXPECT_EQ(0, m3.shift);
|
||||
|
@ -30,3 +30,32 @@ TEST(DivUtils, Signed)
|
|||
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
|
||||
EXPECT_EQ(2, minus7.shift);
|
||||
}
|
||||
|
||||
// Verifies the multiplier, shift and fast flag computed for a handful of small
// divisors, covering both fast (3, 5, 9) and non-fast (7, 19) results.
TEST(DivUtils, Unsigned)
{
  struct Expected
  {
    u32 divisor;
    u32 multiplier;
    int shift;
    bool fast;
  };

  const Expected expectations[] = {
      {3, 0xAAAAAAABU, 1, true},
      {5, 0xCCCCCCCDU, 2, true},
      {7, 0x92492492U, 2, false},
      {9, 0x38E38E39U, 1, true},
      {19, 0xD79435E5U, 4, false},
  };

  for (const Expected& expected : expectations)
  {
    const UnsignedMagic magic = UnsignedDivisionConstants(expected.divisor);

    EXPECT_EQ(expected.multiplier, magic.multiplier) << "divisor " << expected.divisor;
    EXPECT_EQ(expected.shift, magic.shift) << "divisor " << expected.divisor;
    EXPECT_EQ(expected.fast, magic.fast) << "divisor " << expected.divisor;
  }
}
|
||||
|
|
Loading…
Reference in New Issue