Jit64: divwx - Optimize constant divisor

Optimize division by a constant into multiplication. This method is also used by GCC and LLVM. We also add optimized paths for divisors 0, 1, and -1, because they don't work using this method. They don't occur very often, but are necessary for correctness. - Division by 1 Before: 41 BF 01 00 00 00 mov r15d,1 41 8B C5 mov eax,r13d 45 85 FF test r15d,r15d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FF FF cmp r15d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B F8 mov r15d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FF idiv eax,r15d 44 8B F8 mov r15d,eax done: After: 45 8B FD mov r15d,r13d - Division by 30307 Before: 41 BA 63 76 00 00 mov r10d,7663h 41 8B C5 mov eax,r13d 45 85 D2 test r10d,r10d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FA FF cmp r10d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B C0 mov r8d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FA idiv eax,r10d 44 8B C0 mov r8d,eax done: After: 49 63 C5 movsxd rax,r13d 48 69 C0 65 6B 32 45 imul rax,rax,45326B65h 4C 8B C0 mov r8,rax 48 C1 E8 3F shr rax,3Fh 49 C1 F8 2D sar r8,2Dh 44 03 C0 add r8d,eax - Division by 30323 Before: 41 BA 73 76 00 00 mov r10d,7673h 41 8B C5 mov eax,r13d 45 85 D2 test r10d,r10d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FA FF cmp r10d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B C0 mov r8d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FA idiv eax,r10d 44 8B C0 mov r8d,eax done: After: 49 63 C5 movsxd rax,r13d 4C 69 C0 19 25 52 8A imul r8,rax,0FFFFFFFF8A522519h 49 C1 E8 20 shr r8,20h 44 03 C0 add r8d,eax C1 E8 1F shr eax,1Fh 41 C1 F8 0E sar r8d,0Eh 44 03 C0 add r8d,eax
2021-03-04 20:17:50 +01:00 · 2021-03-04 20:17:50 +01:00 · 95698c5ae1
parent 5bb8798df6
commit 95698c5ae1
1 changed files with 84 additions and 0 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -16,10 +16,12 @@
 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
 #include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
+#include "Core/PowerPC/JitCommon/DivUtils.h"
 #include "Core/PowerPC/PPCAnalyst.h"
 #include "Core/PowerPC/PowerPC.h"

 using namespace Gen;
+using namespace JitCommon;

 void Jit64::GenerateConstantOverflow(s64 val)
 {
@@ -1414,6 +1416,88 @@ void Jit64::divwx(UGeckoInstruction inst)
      SetJumpTarget(done);
    }
  }
+  else if (gpr.IsImm(b))
+  {
+    // Constant divisor
+    const s32 divisor = gpr.SImm32(b);
+    RCOpArg Ra = gpr.Use(a, RCMode::Read);
+    RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
+    RegCache::Realize(Ra, Rd);
+
+    // Handle 0, 1, and -1 explicitly
+    if (divisor == 0)
+    {
+      if (d != a)
+        MOV(32, Rd, Ra);
+      SAR(32, Rd, Imm8(31));
+      if (inst.OE)
+        GenerateConstantOverflow(true);
+    }
+    else if (divisor == 1)
+    {
+      if (d != a)
+        MOV(32, Rd, Ra);
+      if (inst.OE)
+        GenerateConstantOverflow(false);
+    }
+    else if (divisor == -1)
+    {
+      if (d != a)
+        MOV(32, Rd, Ra);
+
+      CMP(32, Rd, Imm32(0x80000000));
+      const FixupBranch normal = J_CC(CC_NE);
+
+      MOV(32, Rd, Imm32(0xFFFFFFFF));
+      if (inst.OE)
+        GenerateConstantOverflow(true);
+      const FixupBranch done = J();
+
+      SetJumpTarget(normal);
+      NEG(32, Rd);
+      if (inst.OE)
+        GenerateConstantOverflow(false);
+
+      SetJumpTarget(done);
+    }
+    else
+    {
+      // Optimize signed 32-bit integer division by a constant
+      Magic m = SignedDivisionConstants(divisor);
+
+      MOVSX(64, 32, RSCRATCH, Ra);
+
+      if (divisor > 0 && m.multiplier < 0)
+      {
+        IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
+        SHR(64, Rd, Imm8(32));
+        ADD(32, Rd, R(RSCRATCH));
+        SHR(32, R(RSCRATCH), Imm8(31));
+        SAR(32, Rd, Imm8(m.shift));
+      }
+      else if (divisor < 0 && m.multiplier > 0)
+      {
+        IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
+        SHR(64, R(RSCRATCH), Imm8(32));
+        SUB(32, R(RSCRATCH), Rd);
+        MOV(32, Rd, R(RSCRATCH));
+        SHR(32, Rd, Imm8(31));
+        SAR(32, R(RSCRATCH), Imm8(m.shift));
+      }
+      else
+      {
+        IMUL(64, RSCRATCH, R(RSCRATCH), Imm32(m.multiplier));
+        MOV(64, Rd, R(RSCRATCH));
+        SHR(64, R(RSCRATCH), Imm8(63));
+        SAR(64, R(Rd), Imm8(32 + m.shift));
+      }
+
+      ADD(32, Rd, R(RSCRATCH));
+
+      if (inst.OE)
+        GenerateConstantOverflow(false);
+    }
+  }
  else
  {
    RCOpArg Ra = gpr.Use(a, RCMode::Read);