From 2e4e2ad1ffc92ac76498f926ca88637d098ea747 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sun, 15 Nov 2020 13:04:57 +0100 Subject: [PATCH 1/9] Jit64: subfic - Handle constants Occurs surprisingly often. Prevents generating silly code like this: BE 03 00 00 00 mov esi,3 83 EE 08 sub esi,8 0F 93 45 58 setae byte ptr [rbp+58h] --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 74362c12c1..12e46b0530 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -862,13 +862,20 @@ void Jit64::subfic(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITIntegerOff); - int a = inst.RA, d = inst.RD; + int a = inst.RA, d = inst.RD, imm = inst.SIMM_16; + + if (gpr.IsImm(a)) + { + u32 i = imm, j = gpr.Imm32(a); + gpr.SetImmediate32(d, i - j); + FinalizeCarry(j == 0 || (i > j - 1)); + return; + } RCOpArg Ra = gpr.Use(a, RCMode::Read); RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RegCache::Realize(Ra, Rd); - int imm = inst.SIMM_16; if (d == a) { if (imm == 0) From 17db359979587c588d549e3f31dc8994c96abd8b Mon Sep 17 00:00:00 2001 From: Sintendo Date: Mon, 16 Nov 2020 23:00:52 +0100 Subject: [PATCH 2/9] Jit64: srwx - Optimize shift by constant More efficient code can be generated if the shift amount is known at compile time. Similar optimizations were present in JitArm64 already, but were missing in Jit64. - By using an 8-bit immediate we can eliminate the need for ECX as a scratch register, thereby reducing register pressure and occasionally eliminating a spill. Before: B9 18 00 00 00 mov ecx,18h 45 8B C1 mov r8d,r9d 49 D3 E8 shr r8,cl After: 45 8B C1 mov r8d,r9d 41 C1 E8 18 shr r8d,18h - PowerPC has strange shift amount masking behavior which is emulated using 64-bit shifts, even though we only care about a 32-bit result. 
If the shift amount is known, we can handle this special case separately, and use 32-bit shift instructions otherwise. Before: B9 F8 FF FF FF mov ecx,0FFFFFFF8h 45 8B C1 mov r8d,r9d 49 D3 E8 shr r8,cl After: Nothing, register is set to constant zero. - A shift by zero becomes a simple MOV. Before: B9 00 00 00 00 mov ecx,0 45 8B C1 mov r8d,r9d 49 D3 E8 shr r8,cl After: 45 8B C1 mov r8d,r9d --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 12e46b0530..4166d8626b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1795,6 +1795,27 @@ void Jit64::srwx(UGeckoInstruction inst) u32 amount = gpr.Imm32(b); gpr.SetImmediate32(a, (amount & 0x20) ? 0 : (gpr.Imm32(s) >> (amount & 0x1f))); } + else if (gpr.IsImm(b)) + { + u32 amount = gpr.Imm32(b); + if (amount & 0x20) + { + gpr.SetImmediate32(a, 0); + } + else + { + RCX64Reg Ra = gpr.Bind(a, RCMode::Write); + RCOpArg Rs = gpr.Use(s, RCMode::Read); + RegCache::Realize(Ra, Rs); + + if (a != s) + MOV(32, Ra, Rs); + + amount &= 0x1f; + if (amount != 0) + SHR(32, Ra, Imm8(amount)); + } + } else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice From 17dc870847ba65a5c8aceeafd21fe7623d042c42 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Mon, 16 Nov 2020 23:54:54 +0100 Subject: [PATCH 3/9] Jit64: slwx - Optimize shift by constant More efficient code can be generated if the shift amount is known at compile time. Similar optimizations were present in JitArm64 already, but were missing in Jit64. - By using an 8-bit immediate we can eliminate the need for ECX as a scratch register, thereby reducing register pressure and occasionally eliminating a spill. 
Before: B9 18 00 00 00 mov ecx,18h 41 8B F7 mov esi,r15d 48 D3 E6 shl rsi,cl 8B F6 mov esi,esi After: 41 8B CF mov ecx,r15d C1 E1 18 shl ecx,18h - PowerPC has strange shift amount masking behavior which is emulated using 64-bit shifts, even though we only care about a 32-bit result. If the shift amount is known, we can handle this special case separately, and use 32-bit shift instructions otherwise. We also no longer need to clear the upper 32 bits of the register. Before: BE F8 FF FF FF mov esi,0FFFFFFF8h 8B CE mov ecx,esi 41 8B F4 mov esi,r12d 48 D3 E6 shl rsi,cl 8B F6 mov esi,esi After: Nothing, register is set to constant zero. - A shift by zero becomes a simple MOV. Before: BE 00 00 00 00 mov esi,0 8B CE mov ecx,esi 41 8B F3 mov esi,r11d 48 D3 E6 shl rsi,cl 8B F6 mov esi,esi After: 41 8B FB mov edi,r11d --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 4166d8626b..fb632a42fd 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1849,6 +1849,30 @@ void Jit64::slwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC(a); } + else if (gpr.IsImm(b)) + { + u32 amount = gpr.Imm32(b); + if (amount & 0x20) + { + gpr.SetImmediate32(a, 0); + } + else + { + RCX64Reg Ra = gpr.Bind(a, RCMode::Write); + RCOpArg Rs = gpr.Use(s, RCMode::Read); + RegCache::Realize(Ra, Rs); + + if (a != s) + MOV(32, Ra, Rs); + + amount &= 0x1f; + if (amount != 0) + SHL(32, Ra, Imm8(amount)); + } + + if (inst.Rc) + ComputeRC(a); + } else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice From b968120f8a2f9925bd218357a095a41e29449c8e Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 18 Nov 2020 00:03:16 +0100 Subject: [PATCH 4/9] Jit64: srawx - Optimize shift by constant More efficient code can be generated if the shift amount is known at compile time. 
We can once again take advantage of shifts with the shift amount in an 8-bit immediate to eliminate ECX as a scratch register, reducing register pressure and removing the occasional spill. We can also do 32-bit shifts instead of 64-bit operations. We recognize four distinct cases: - The special case where we're dealing with the PowerPC's quirky shift amount masking. If the shift amount is a number from 32 to 63, all bits are shifted out and the result is either all zeroes or all ones. Before: B9 F0 FF FF FF mov ecx,0FFFFFFF0h 8B F7 mov esi,edi 48 C1 E6 20 shl rsi,20h 48 D3 FE sar rsi,cl 8B C6 mov eax,esi 48 C1 EE 20 shr rsi,20h 85 F0 test eax,esi 0F 95 45 58 setne byte ptr [rbp+58h] After: 8B F7 mov esi,edi C1 FE 1F sar esi,1Fh 0F 95 45 58 setne byte ptr [rbp+58h] - The shift amount is zero. No calculation needs to be done, just clear the carry flag. Before: B9 00 00 00 00 mov ecx,0 49 C1 E5 20 shl r13,20h 49 D3 FD sar r13,cl 41 8B C5 mov eax,r13d 49 C1 ED 20 shr r13,20h 44 85 E8 test eax,r13d 0F 95 45 58 setne byte ptr [rbp+58h] After: C6 45 58 00 mov byte ptr [rbp+58h],0 - The carry flag doesn't need to be computed. Just do the arithmetic shift. Before: B9 02 00 00 00 mov ecx,2 48 C1 E7 20 shl rdi,20h 48 D3 FF sar rdi,cl 48 C1 EF 20 shr rdi,20h After: C1 FF 02 sar edi,2 - The carry flag must be computed. In addition to the arithmetic shift, we do a shift to the left and AND them together to know if any ones were shifted out. It's still better than before, because we can do 32-bit shifts.
Before: B9 02 00 00 00 mov ecx,2 49 C1 E5 20 shl r13,20h 49 D3 FD sar r13,cl 41 8B C5 mov eax,r13d 49 C1 ED 20 shr r13,20h 44 85 E8 test eax,r13d 0F 95 45 58 setne byte ptr [rbp+58h] After: 41 8B C5 mov eax,r13d 41 C1 FD 02 sar r13d,2 C1 E0 1E shl eax,1Eh 44 85 E8 test eax,r13d 0F 95 45 58 setne byte ptr [rbp+58h] --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index fb632a42fd..fb782c8a66 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1907,6 +1907,43 @@ void Jit64::srawx(UGeckoInstruction inst) int b = inst.RB; int s = inst.RS; + if (gpr.IsImm(b)) + { + u32 amount = gpr.Imm32(b); + RCX64Reg Ra = gpr.Bind(a, RCMode::Write); + RCOpArg Rs = gpr.Use(s, RCMode::Read); + RegCache::Realize(Ra, Rs); + + if (a != s) + MOV(32, Ra, Rs); + + bool special = amount & 0x20; + amount &= 0x1f; + + if (special) + { + SAR(32, Ra, Imm8(31)); + FinalizeCarry(CC_NZ); + } + else if (amount == 0) + { + FinalizeCarry(false); + } + else if (!js.op->wantsCA) + { + SAR(32, Ra, Imm8(amount)); + FinalizeCarry(CC_NZ); + } + else + { + MOV(32, R(RSCRATCH), Ra); + SAR(32, Ra, Imm8(amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); + TEST(32, Ra, R(RSCRATCH)); + FinalizeCarry(CC_NZ); + } + } + else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice RCX64Reg Ra = gpr.Bind(a, RCMode::Write); From 8ac40162da0b43ebfb668d6858639d9b49c491f5 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 18 Nov 2020 23:20:30 +0100 Subject: [PATCH 5/9] Jit64: srawx - Handle constant input registers If both input registers hold known values at compile time, we can just calculate the result on the spot. Code has mostly been copied from JitArm64 where it had already been implemented. 
Before: BF FF FF FF FF mov edi,0FFFFFFFFh 8B C7 mov eax,edi C1 FF 10 sar edi,10h C1 E0 10 shl eax,10h 85 F8 test eax,edi 0F 95 45 58 setne byte ptr [rbp+58h] After: C6 45 58 01 mov byte ptr [rbp+58h],1 --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index fb782c8a66..e7591429e1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1907,7 +1907,22 @@ void Jit64::srawx(UGeckoInstruction inst) int b = inst.RB; int s = inst.RS; - if (gpr.IsImm(b)) + if (gpr.IsImm(b, s)) + { + s32 i = gpr.SImm32(s), amount = gpr.SImm32(b); + if (amount & 0x20) + { + gpr.SetImmediate32(a, i & 0x80000000 ? 0xFFFFFFFF : 0); + FinalizeCarry(i & 0x80000000 ? true : false); + } + else + { + amount &= 0x1F; + gpr.SetImmediate32(a, i >> amount); + FinalizeCarry(amount != 0 && i < 0 && (u32(i) << (32 - amount))); + } + } + else if (gpr.IsImm(b)) { u32 amount = gpr.Imm32(b); RCX64Reg Ra = gpr.Bind(a, RCMode::Write); From cb70d5ee4f7f8b2209b79546db938bdd4d4727a1 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 19 Nov 2020 21:20:54 +0100 Subject: [PATCH 6/9] Jit64: srawix - Handle constant input register Much like we did for srawx. This was already implemented on JitArm64. 
Before: B8 00 00 00 00 mov eax,0 8B F0 mov esi,eax C1 E8 1F shr eax,1Fh 23 C6 and eax,esi D1 FE sar esi,1 88 45 58 mov byte ptr [rbp+58h],al After: C6 45 58 00 mov byte ptr [rbp+58h],0 --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index e7591429e1..608e588bad 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1995,7 +1995,13 @@ void Jit64::srawix(UGeckoInstruction inst) int s = inst.RS; int amount = inst.SH; - if (amount != 0) + if (gpr.IsImm(s)) + { + s32 imm = gpr.SImm32(s); + gpr.SetImmediate32(a, imm >> amount); + FinalizeCarry(amount != 0 && imm < 0 && (u32(imm) << (32 - amount))); + } + else if (amount != 0) { RCX64Reg Ra = gpr.Bind(a, RCMode::Write); RCOpArg Rs = gpr.Use(s, RCMode::Read); From 1a52fdf7e361b26eb1b7a0ae551b9672947b9b09 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 19 Nov 2020 21:25:11 +0100 Subject: [PATCH 7/9] Jit64: rlwnmx - Optimize rotate by constant Only removes the scratch register and a MOV, but hey. 
Before: B9 02 00 00 00 mov ecx,2 41 8B F5 mov esi,r13d D3 C6 rol esi,cl 83 E6 01 and esi,1 After: 41 8B F5 mov esi,r13d C1 C6 02 rol esi,2 83 E6 01 and esi,1 --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 608e588bad..0e04f574a5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1729,6 +1729,25 @@ void Jit64::rlwnmx(UGeckoInstruction inst) { gpr.SetImmediate32(a, Common::RotateLeft(gpr.Imm32(s), gpr.Imm32(b) & 0x1F) & mask); } + else if (gpr.IsImm(b)) + { + u32 amount = gpr.Imm32(b) & 0x1f; + RCX64Reg Ra = gpr.Bind(a, RCMode::Write); + RCOpArg Rs = gpr.Use(s, RCMode::Read); + RegCache::Realize(Ra, Rs); + + if (a != s) + MOV(32, Ra, Rs); + + if (amount) + ROL(32, Ra, Imm8(amount)); + + // we need flags if we're merging the branch + if (inst.Rc && CheckMergedBranch(0)) + AND(32, Ra, Imm32(mask)); + else + AndWithMask(Ra, mask); + } else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice From 10d65519f95b33221731aab1b96d98d69e5792f3 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sun, 13 Dec 2020 11:52:21 +0100 Subject: [PATCH 8/9] Jit64: slwx - Handle constant zero input Shifting zero by any amount always gives zero. Before: 41 BF 00 00 00 00 mov r15d,0 8B CF mov ecx,edi 49 D3 E7 shl r15,cl 45 8B FF mov r15d,r15d After: Nothing, register is set to constant zero. All games I've tried hit this optimization on launch. In Soul Calibur II it occurs very frequently during gameplay. 
--- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 0e04f574a5..d32d75bb88 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1892,6 +1892,12 @@ void Jit64::slwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC(a); } + else if (gpr.IsImm(s) && gpr.Imm32(s) == 0) + { + gpr.SetImmediate32(a, 0); + if (inst.Rc) + ComputeRC(a); + } else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice From 67d2fa11f122dfd15f157d59356bcfb69d3acc47 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sun, 13 Dec 2020 12:36:07 +0100 Subject: [PATCH 9/9] Jit64: srawx - Handle constant zero input Shifting zero by any amount always gives zero. Before: 41 B9 00 00 00 00 mov r9d,0 41 8B CF mov ecx,r15d 49 C1 E1 20 shl r9,20h 49 D3 F9 sar r9,cl 49 C1 E9 20 shr r9,20h After: Nothing, register is set to constant zero. Before: 41 B8 00 00 00 00 mov r8d,0 41 8B CF mov ecx,r15d 49 C1 E0 20 shl r8,20h 49 D3 F8 sar r8,cl 41 8B C0 mov eax,r8d 49 C1 E8 20 shr r8,20h 44 85 C0 test eax,r8d 0F 95 45 58 setne byte ptr [rbp+58h] After: C6 45 58 00 mov byte ptr [rbp+58h],0 Occurs a bunch of times in Super Mario Sunshine. Since this is an arithmetic shift a similar optimization can be done for constant -1 (0xFFFFFFFF), but I couldn't find any game where this happens. 
--- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index d32d75bb88..1bc4ef5c35 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1983,6 +1983,11 @@ void Jit64::srawx(UGeckoInstruction inst) FinalizeCarry(CC_NZ); } } + else if (gpr.IsImm(s) && gpr.Imm32(s) == 0) + { + gpr.SetImmediate32(a, 0); + FinalizeCarry(false); + } else { RCX64Reg ecx = gpr.Scratch(ECX); // no register choice