From 2e4e2ad1ffc92ac76498f926ca88637d098ea747 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Sun, 15 Nov 2020 13:04:57 +0100
Subject: [PATCH 1/9] Jit64: subfic - Handle constants

Occurs surprisingly often. Prevents generating silly code like this:

BE 03 00 00 00       mov         esi,3
83 EE 08             sub         esi,8
0F 93 45 58          setae       byte ptr [rbp+58h]
---
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 74362c12c1..12e46b0530 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -862,13 +862,20 @@ void Jit64::subfic(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITIntegerOff);
-  int a = inst.RA, d = inst.RD;
+  int a = inst.RA, d = inst.RD, imm = inst.SIMM_16;
+
+  if (gpr.IsImm(a))
+  {
+    u32 i = imm, j = gpr.Imm32(a);
+    gpr.SetImmediate32(d, i - j);
+    FinalizeCarry(j == 0 || (i > j - 1));
+    return;
+  }
 
   RCOpArg Ra = gpr.Use(a, RCMode::Read);
   RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
   RegCache::Realize(Ra, Rd);
 
-  int imm = inst.SIMM_16;
   if (d == a)
   {
     if (imm == 0)

From 17db359979587c588d549e3f31dc8994c96abd8b Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Mon, 16 Nov 2020 23:00:52 +0100
Subject: [PATCH 2/9] Jit64: srwx - Optimize shift by constant

More efficient code can be generated if the shift amount is known at
compile time. Similar optimizations were present in JitArm64 already,
but were missing in Jit64.

- By using an 8-bit immediate we can eliminate the need for ECX as a
  scratch register, thereby reducing register pressure and occasionally
  eliminating a spill.

Before:
B9 18 00 00 00       mov         ecx,18h
45 8B C1             mov         r8d,r9d
49 D3 E8             shr         r8,cl

After:
45 8B C1             mov         r8d,r9d
41 C1 E8 18          shr         r8d,18h

- PowerPC has strange shift amount masking behavior which is emulated
  using 64-bit shifts, even though we only care about a 32-bit result.
  If the shift amount is known, we can handle this special case
  separately, and use 32-bit shift instructions otherwise.

Before:
B9 F8 FF FF FF       mov         ecx,0FFFFFFF8h
45 8B C1             mov         r8d,r9d
49 D3 E8             shr         r8,cl

After:
Nothing, register is set to constant zero.

- A shift by zero becomes a simple MOV.

Before:
B9 00 00 00 00       mov         ecx,0
45 8B C1             mov         r8d,r9d
49 D3 E8             shr         r8,cl

After:
45 8B C1             mov         r8d,r9d
---
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 12e46b0530..4166d8626b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1795,6 +1795,27 @@ void Jit64::srwx(UGeckoInstruction inst)
     u32 amount = gpr.Imm32(b);
     gpr.SetImmediate32(a, (amount & 0x20) ? 0 : (gpr.Imm32(s) >> (amount & 0x1f)));
   }
+  else if (gpr.IsImm(b))
+  {
+    u32 amount = gpr.Imm32(b);
+    if (amount & 0x20)
+    {
+      gpr.SetImmediate32(a, 0);
+    }
+    else
+    {
+      RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+      RCOpArg Rs = gpr.Use(s, RCMode::Read);
+      RegCache::Realize(Ra, Rs);
+
+      if (a != s)
+        MOV(32, Ra, Rs);
+
+      amount &= 0x1f;
+      if (amount != 0)
+        SHR(32, Ra, Imm8(amount));
+    }
+  }
   else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice

From 17dc870847ba65a5c8aceeafd21fe7623d042c42 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Mon, 16 Nov 2020 23:54:54 +0100
Subject: [PATCH 3/9] Jit64: slwx - Optimize shift by constant

More efficient code can be generated if the shift amount is known at
compile time. Similar optimizations were present in JitArm64 already,
but were missing in Jit64.

- By using an 8-bit immediate we can eliminate the need for ECX as a
  scratch register, thereby reducing register pressure and occasionally
  eliminating a spill.

Before:
B9 18 00 00 00       mov         ecx,18h
41 8B F7             mov         esi,r15d
48 D3 E6             shl         rsi,cl
8B F6                mov         esi,esi

After:
41 8B CF             mov         ecx,r15d
C1 E1 18             shl         ecx,18h

- PowerPC has strange shift amount masking behavior which is emulated
  using 64-bit shifts, even though we only care about a 32-bit result.
  If the shift amount is known, we can handle this special case
  separately, and use 32-bit shift instructions otherwise. We also no
  longer need to clear the upper 32 bits of the register.

Before:
BE F8 FF FF FF       mov         esi,0FFFFFFF8h
8B CE                mov         ecx,esi
41 8B F4             mov         esi,r12d
48 D3 E6             shl         rsi,cl
8B F6                mov         esi,esi

After:
Nothing, register is set to constant zero.

- A shift by zero becomes a simple MOV.

Before:
BE 00 00 00 00       mov         esi,0
8B CE                mov         ecx,esi
41 8B F3             mov         esi,r11d
48 D3 E6             shl         rsi,cl
8B F6                mov         esi,esi

After:
41 8B FB             mov         edi,r11d
---
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 4166d8626b..fb632a42fd 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1849,6 +1849,30 @@ void Jit64::slwx(UGeckoInstruction inst)
     if (inst.Rc)
       ComputeRC(a);
   }
+  else if (gpr.IsImm(b))
+  {
+    u32 amount = gpr.Imm32(b);
+    if (amount & 0x20)
+    {
+      gpr.SetImmediate32(a, 0);
+    }
+    else
+    {
+      RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+      RCOpArg Rs = gpr.Use(s, RCMode::Read);
+      RegCache::Realize(Ra, Rs);
+
+      if (a != s)
+        MOV(32, Ra, Rs);
+
+      amount &= 0x1f;
+      if (amount != 0)
+        SHL(32, Ra, Imm8(amount));
+    }
+
+    if (inst.Rc)
+      ComputeRC(a);
+  }
   else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice

From b968120f8a2f9925bd218357a095a41e29449c8e Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Wed, 18 Nov 2020 00:03:16 +0100
Subject: [PATCH 4/9] Jit64: srawx - Optimize shift by constant

More efficient code can be generated if the shift amount is known at
compile time. We can once again take advantage of shifts with the shift
amount in an 8-bit immediate to eliminate ECX as a scratch register,
reducing register pressure and removing the occasional spill. We can
also do 32-bit shifts instead of 64-bit operations.

We recognize four distinct cases:

- The special case where we're dealing with the PowerPC's quirky shift
  amount masking. If the shift amount is a number from 32 to 63, all
  bits are shifted out and the result is either all zeroes or all ones.

Before:
B9 F0 FF FF FF       mov         ecx,0FFFFFFF0h
8B F7                mov         esi,edi
48 C1 E6 20          shl         rsi,20h
48 D3 FE             sar         rsi,cl
8B C6                mov         eax,esi
48 C1 EE 20          shr         rsi,20h
85 F0                test        eax,esi
0F 95 45 58          setne       byte ptr [rbp+58h]

After:
8B F7                mov         esi,edi
C1 FE 1F             sar         esi,1Fh
0F 95 45 58          setne       byte ptr [rbp+58h]

- The shift amount is zero. No calculation needs to be done, just clear
  the carry flag.

Before:
B9 00 00 00 00       mov         ecx,0
49 C1 E5 20          shl         r13,20h
49 D3 FD             sar         r13,cl
41 8B C5             mov         eax,r13d
49 C1 ED 20          shr         r13,20h
44 85 E8             test        eax,r13d
0F 95 45 58          setne       byte ptr [rbp+58h]

After:
C6 45 58 00          mov         byte ptr [rbp+58h],0

- The carry flag doesn't need to be computed. Just do the arithmetic
  shift.

Before:
B9 02 00 00 00       mov         ecx,2
48 C1 E7 20          shl         rdi,20h
48 D3 FF             sar         rdi,cl
48 C1 EF 20          shr         rdi,20h

After:
C1 FF 02             sar         edi,2

- The carry flag must be computed. In addition to the arithmetic shift,
  we do a shift to the left and AND the two results together to know if
  any ones were shifted out. It's still better than before, because we
  can do 32-bit shifts.

Before:
B9 02 00 00 00       mov         ecx,2
49 C1 E5 20          shl         r13,20h
49 D3 FD             sar         r13,cl
41 8B C5             mov         eax,r13d
49 C1 ED 20          shr         r13,20h
44 85 E8             test        eax,r13d
0F 95 45 58          setne       byte ptr [rbp+58h]

After:
41 8B C5             mov         eax,r13d
41 C1 FD 02          sar         r13d,2
C1 E0 1E             shl         eax,1Eh
44 85 E8             test        eax,r13d
0F 95 45 58          setne       byte ptr [rbp+58h]
---
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index fb632a42fd..fb782c8a66 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1907,6 +1907,43 @@ void Jit64::srawx(UGeckoInstruction inst)
   int b = inst.RB;
   int s = inst.RS;
 
+  if (gpr.IsImm(b))
+  {
+    u32 amount = gpr.Imm32(b);
+    RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+    RCOpArg Rs = gpr.Use(s, RCMode::Read);
+    RegCache::Realize(Ra, Rs);
+
+    if (a != s)
+      MOV(32, Ra, Rs);
+
+    bool special = amount & 0x20;
+    amount &= 0x1f;
+
+    if (special)
+    {
+      SAR(32, Ra, Imm8(31));
+      FinalizeCarry(CC_NZ);
+    }
+    else if (amount == 0)
+    {
+      FinalizeCarry(false);
+    }
+    else if (!js.op->wantsCA)
+    {
+      SAR(32, Ra, Imm8(amount));
+      FinalizeCarry(CC_NZ);
+    }
+    else
+    {
+      MOV(32, R(RSCRATCH), Ra);
+      SAR(32, Ra, Imm8(amount));
+      SHL(32, R(RSCRATCH), Imm8(32 - amount));
+      TEST(32, Ra, R(RSCRATCH));
+      FinalizeCarry(CC_NZ);
+    }
+  }
+  else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice
     RCX64Reg Ra = gpr.Bind(a, RCMode::Write);

From 8ac40162da0b43ebfb668d6858639d9b49c491f5 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Wed, 18 Nov 2020 23:20:30 +0100
Subject: [PATCH 5/9] Jit64: srawx - Handle constant input registers

If both input registers hold known values at compile time, we can just
calculate the result on the spot.

Code has mostly been copied from JitArm64 where it had already been implemented.

Before:
BF FF FF FF FF       mov         edi,0FFFFFFFFh
8B C7                mov         eax,edi
C1 FF 10             sar         edi,10h
C1 E0 10             shl         eax,10h
85 F8                test        eax,edi
0F 95 45 58          setne       byte ptr [rbp+58h]

After:
C6 45 58 01          mov         byte ptr [rbp+58h],1
---
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index fb782c8a66..e7591429e1 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1907,7 +1907,22 @@ void Jit64::srawx(UGeckoInstruction inst)
   int b = inst.RB;
   int s = inst.RS;
 
-  if (gpr.IsImm(b))
+  if (gpr.IsImm(b, s))
+  {
+    s32 i = gpr.SImm32(s), amount = gpr.SImm32(b);
+    if (amount & 0x20)
+    {
+      gpr.SetImmediate32(a, i & 0x80000000 ? 0xFFFFFFFF : 0);
+      FinalizeCarry(i & 0x80000000 ? true : false);
+    }
+    else
+    {
+      amount &= 0x1F;
+      gpr.SetImmediate32(a, i >> amount);
+      FinalizeCarry(amount != 0 && i < 0 && (u32(i) << (32 - amount)));
+    }
+  }
+  else if (gpr.IsImm(b))
   {
     u32 amount = gpr.Imm32(b);
     RCX64Reg Ra = gpr.Bind(a, RCMode::Write);

From cb70d5ee4f7f8b2209b79546db938bdd4d4727a1 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Thu, 19 Nov 2020 21:20:54 +0100
Subject: [PATCH 6/9] Jit64: srawix - Handle constant input register

Much like we did for srawx. This was already implemented on JitArm64.

Before:
B8 00 00 00 00       mov         eax,0
8B F0                mov         esi,eax
C1 E8 1F             shr         eax,1Fh
23 C6                and         eax,esi
D1 FE                sar         esi,1
88 45 58             mov         byte ptr [rbp+58h],al

After:
C6 45 58 00          mov         byte ptr [rbp+58h],0
---
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index e7591429e1..608e588bad 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1995,7 +1995,13 @@ void Jit64::srawix(UGeckoInstruction inst)
   int s = inst.RS;
   int amount = inst.SH;
 
-  if (amount != 0)
+  if (gpr.IsImm(s))
+  {
+    s32 imm = gpr.SImm32(s);
+    gpr.SetImmediate32(a, imm >> amount);
+    FinalizeCarry(amount != 0 && imm < 0 && (u32(imm) << (32 - amount)));
+  }
+  else if (amount != 0)
   {
     RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
     RCOpArg Rs = gpr.Use(s, RCMode::Read);

From 1a52fdf7e361b26eb1b7a0ae551b9672947b9b09 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Thu, 19 Nov 2020 21:25:11 +0100
Subject: [PATCH 7/9] Jit64: rlwnmx - Optimize rotate by constant

Only removes the scratch register and a MOV, but hey.

Before:
B9 02 00 00 00       mov         ecx,2
41 8B F5             mov         esi,r13d
D3 C6                rol         esi,cl
83 E6 01             and         esi,1

After:
41 8B F5             mov         esi,r13d
C1 C6 02             rol         esi,2
83 E6 01             and         esi,1
---
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 608e588bad..0e04f574a5 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1729,6 +1729,25 @@ void Jit64::rlwnmx(UGeckoInstruction inst)
   {
     gpr.SetImmediate32(a, Common::RotateLeft(gpr.Imm32(s), gpr.Imm32(b) & 0x1F) & mask);
   }
+  else if (gpr.IsImm(b))
+  {
+    u32 amount = gpr.Imm32(b) & 0x1f;
+    RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+    RCOpArg Rs = gpr.Use(s, RCMode::Read);
+    RegCache::Realize(Ra, Rs);
+
+    if (a != s)
+      MOV(32, Ra, Rs);
+
+    if (amount)
+      ROL(32, Ra, Imm8(amount));
+
+    // we need flags if we're merging the branch
+    if (inst.Rc && CheckMergedBranch(0))
+      AND(32, Ra, Imm32(mask));
+    else
+      AndWithMask(Ra, mask);
+  }
   else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice

From 10d65519f95b33221731aab1b96d98d69e5792f3 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Sun, 13 Dec 2020 11:52:21 +0100
Subject: [PATCH 8/9] Jit64: slwx - Handle constant zero input

Shifting zero by any amount always gives zero.

Before:
41 BF 00 00 00 00    mov         r15d,0
8B CF                mov         ecx,edi
49 D3 E7             shl         r15,cl
45 8B FF             mov         r15d,r15d

After:
Nothing, register is set to constant zero.

All games I've tried hit this optimization on launch. In Soul Calibur II
it occurs very frequently during gameplay.
---
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 0e04f574a5..d32d75bb88 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1892,6 +1892,12 @@ void Jit64::slwx(UGeckoInstruction inst)
     if (inst.Rc)
       ComputeRC(a);
   }
+  else if (gpr.IsImm(s) && gpr.Imm32(s) == 0)
+  {
+    gpr.SetImmediate32(a, 0);
+    if (inst.Rc)
+      ComputeRC(a);
+  }
   else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice

From 67d2fa11f122dfd15f157d59356bcfb69d3acc47 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Sun, 13 Dec 2020 12:36:07 +0100
Subject: [PATCH 9/9] Jit64: srawx - Handle constant zero input

Shifting zero by any amount always gives zero.

Before:
41 B9 00 00 00 00    mov         r9d,0
41 8B CF             mov         ecx,r15d
49 C1 E1 20          shl         r9,20h
49 D3 F9             sar         r9,cl
49 C1 E9 20          shr         r9,20h

After:
Nothing, register is set to constant zero.

Before:
41 B8 00 00 00 00    mov         r8d,0
41 8B CF             mov         ecx,r15d
49 C1 E0 20          shl         r8,20h
49 D3 F8             sar         r8,cl
41 8B C0             mov         eax,r8d
49 C1 E8 20          shr         r8,20h
44 85 C0             test        eax,r8d
0F 95 45 58          setne       byte ptr [rbp+58h]

After:
C6 45 58 00          mov         byte ptr [rbp+58h],0

Occurs a bunch of times in Super Mario Sunshine. Since this is an
arithmetic shift a similar optimization can be done for constant -1
(0xFFFFFFFF), but I couldn't find any game where this happens.
---
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index d32d75bb88..1bc4ef5c35 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1983,6 +1983,11 @@ void Jit64::srawx(UGeckoInstruction inst)
       FinalizeCarry(CC_NZ);
     }
   }
+  else if (gpr.IsImm(s) && gpr.Imm32(s) == 0)
+  {
+    gpr.SetImmediate32(a, 0);
+    FinalizeCarry(false);
+  }
   else
   {
     RCX64Reg ecx = gpr.Scratch(ECX);  // no register choice