From dbb8f588c7ac5c6c4586804c072c87de730ef00c Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 11:10:00 +0100
Subject: [PATCH 1/6] JitArm64: cmpl - Optimize a == 0 case

By explicitly handling this, we can avoid materializing zero in a
register.

Before:
0x52800019   mov    w25, #0x0
0xb94087b6   ldr    w22, [x29, #0x84]
0xcb16033b   sub    x27, x25, x22

After:
0xb94087b9   ldr    w25, [x29, #0x84]
0xcb1903fb   neg    x27, x25
---
 .../Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 2fc1524809..6c0d159be8 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -615,16 +615,19 @@ void JitArm64::cmpl(UGeckoInstruction inst)
     u64 A = gpr.GetImm(a);
     u64 B = gpr.GetImm(b);
     MOVI2R(CR, A - B);
-    return;
   }
-
-  if (gpr.IsImm(b) && !gpr.GetImm(b))
+  else if (gpr.IsImm(a) && !gpr.GetImm(a))
+  {
+    NEG(CR, EncodeRegTo64(gpr.R(b)));
+  }
+  else if (gpr.IsImm(b) && !gpr.GetImm(b))
   {
     MOV(EncodeRegTo32(CR), gpr.R(a));
-    return;
   }
-
-  SUB(gpr.CR(crf), EncodeRegTo64(gpr.R(a)), EncodeRegTo64(gpr.R(b)));
+  else
+  {
+    SUB(CR, EncodeRegTo64(gpr.R(a)), EncodeRegTo64(gpr.R(b)));
+  }
 }
 
 void JitArm64::cmpi(UGeckoInstruction inst)

From f5e7e70cc5078915656aafaa7c6ccb3a38d62b05 Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 11:24:16 +0100
Subject: [PATCH 2/6] JitArm64: cmp - Refactor

---
 .../PowerPC/JitArm64/JitArm64_Integer.cpp | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 6c0d159be8..2b46e80e70 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -578,25 +578,24 @@ void JitArm64::cmp(UGeckoInstruction inst)
     s64 A = static_cast<s32>(gpr.GetImm(a));
     s64 B = static_cast<s32>(gpr.GetImm(b));
     MOVI2R(CR, A - B);
-    return;
   }
-
-  if (gpr.IsImm(b) && !gpr.GetImm(b))
+  else if (gpr.IsImm(b) && !gpr.GetImm(b))
   {
     SXTW(CR, gpr.R(a));
-    return;
   }
+  else
+  {
+    ARM64Reg WA = gpr.GetReg();
+    ARM64Reg XA = EncodeRegTo64(WA);
+    ARM64Reg RA = gpr.R(a);
+    ARM64Reg RB = gpr.R(b);
 
-  ARM64Reg WA = gpr.GetReg();
-  ARM64Reg XA = EncodeRegTo64(WA);
-  ARM64Reg RA = gpr.R(a);
-  ARM64Reg RB = gpr.R(b);
+    SXTW(XA, RA);
+    SXTW(CR, RB);
+    SUB(CR, XA, CR);
 
-  SXTW(XA, RA);
-  SXTW(CR, RB);
-  SUB(CR, XA, CR);
-
-  gpr.Unlock(WA);
+    gpr.Unlock(WA);
+  }
 }
 
 void JitArm64::cmpl(UGeckoInstruction inst)

From 592ba31e224c062fe558dad21d2f605196df0b31 Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 11:51:57 +0100
Subject: [PATCH 3/6] JitArm64: cmp - Optimize a == 0 case

By explicitly handling this, we can avoid materializing zero in a
register and generate more efficient code altogether.

Before:
0x52800016   mov    w22, #0x0
0xb94093b5   ldr    w21, [x29, #0x90]
0x93407ed7   sxtw   x23, w22
0x93407eb9   sxtw   x25, w21
0xcb1902f9   sub    x25, x23, x25

After:
0xb94093b7   ldr    w23, [x29, #0x90]
0x4b1703f9   neg    w25, w23
0x93407f39   sxtw   x25, w25
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 2b46e80e70..2166a1427d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -579,6 +579,11 @@ void JitArm64::cmp(UGeckoInstruction inst)
     s64 B = static_cast<s32>(gpr.GetImm(b));
     MOVI2R(CR, A - B);
   }
+  else if (gpr.IsImm(a) && !gpr.GetImm(a))
+  {
+    NEG(EncodeRegTo32(CR), gpr.R(b));
+    SXTW(CR, EncodeRegTo32(CR));
+  }
   else if (gpr.IsImm(b) && !gpr.GetImm(b))
   {
     SXTW(CR, gpr.R(a));

From 82f22cdfa1fbaeaac8cf608294c4dc730ee0f099 Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 12:00:16 +0100
Subject: [PATCH 4/6] JitArm64: cmp - Optimize a == -1 case

By explicitly handling this, we can avoid materializing -1 in a
register and generate more efficient code by taking advantage of
-x == ~x + 1.

Before:
0x12800015   mov    w21, #-0x1
0x93407eb9   sxtw   x25, w21
0x93407ef8   sxtw   x24, w23
0xcb180338   sub    x24, x25, x24

After:
0x2a3703f8   mvn    w24, w23
0x93407f18   sxtw   x24, w24
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 2166a1427d..6378c322e7 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -584,6 +584,11 @@ void JitArm64::cmp(UGeckoInstruction inst)
     NEG(EncodeRegTo32(CR), gpr.R(b));
     SXTW(CR, EncodeRegTo32(CR));
   }
+  else if (gpr.IsImm(a) && gpr.GetImm(a) == 0xFFFFFFFF)
+  {
+    MVN(EncodeRegTo32(CR), gpr.R(b));
+    SXTW(CR, EncodeRegTo32(CR));
+  }
   else if (gpr.IsImm(b) && !gpr.GetImm(b))
   {
     SXTW(CR, gpr.R(a));

From ae6ce1df48578d381dd43450e00c4254bb8b490d Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 12:15:56 +0100
Subject: [PATCH 5/6] Arm64Emitter: Add ArithOption with ExtendSpecifier

ARM64 can perform various types of sign and zero extension on a
register value before using it. The Arm64Emitter already had support
for this, but it was kinda hidden away. This commit exposes the
functionality by making the ExtendSpecifier enum available everywhere
and adding a new ArithOption constructor.
---
 Source/Core/Common/Arm64Emitter.h | 33 ++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index caf997a29d..19d63a8dae 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -309,6 +309,18 @@ enum class ShiftType
   ROR = 3,
 };
 
+enum class ExtendSpecifier
+{
+  UXTB = 0x0,
+  UXTH = 0x1,
+  UXTW = 0x2, /* Also LSL on 32bit width */
+  UXTX = 0x3, /* Also LSL on 64bit width */
+  SXTB = 0x4,
+  SXTH = 0x5,
+  SXTW = 0x6,
+  SXTX = 0x7,
+};
+
 enum class IndexType
 {
   Unsigned,
@@ -405,18 +417,6 @@ private:
     Width64Bit,
   };
 
-  enum class ExtendSpecifier
-  {
-    UXTB = 0x0,
-    UXTH = 0x1,
-    UXTW = 0x2, /* Also LSL on 32bit width */
-    UXTX = 0x3, /* Also LSL on 64bit width */
-    SXTB = 0x4,
-    SXTH = 0x5,
-    SXTW = 0x6,
-    SXTX = 0x7,
-  };
-
   enum class TypeSpecifier
   {
     ExtendedReg,
@@ -463,6 +463,15 @@ public:
     }
     m_shifttype = ShiftType::LSL;
   }
+  ArithOption(ARM64Reg Rd, ExtendSpecifier extend_type, u32 shift = 0)
+  {
+    m_destReg = Rd;
+    m_width = Is64Bit(Rd) ? WidthSpecifier::Width64Bit : WidthSpecifier::Width32Bit;
+    m_extend = extend_type;
+    m_type = TypeSpecifier::ExtendedReg;
+    m_shifttype = ShiftType::LSL;
+    m_shift = shift;
+  }
   ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
   {
     m_destReg = Rd;

From d0de68c41bb7d1676ae99cf750cb1bbabb7317a1 Mon Sep 17 00:00:00 2001
From: Bram Speeckaert
Date: Tue, 1 Nov 2022 12:21:24 +0100
Subject: [PATCH 6/6] JitArm64: cmp - Optimize general case

We can merge an SXTW with the SUB, eliminating one instruction. In
addition, it is no longer necessary to allocate a temporary register,
reducing register pressure.

Before:
0x93407f59   sxtw   x25, w26
0x93407ebb   sxtw   x27, w21
0xcb1b033b   sub    x27, x25, x27

After:
0x93407f5b   sxtw   x27, w26
0xcb35c37b   sub    x27, x27, w21, sxtw
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index 6378c322e7..f315efb976 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -595,16 +595,11 @@ void JitArm64::cmp(UGeckoInstruction inst)
   }
   else
   {
-    ARM64Reg WA = gpr.GetReg();
-    ARM64Reg XA = EncodeRegTo64(WA);
     ARM64Reg RA = gpr.R(a);
     ARM64Reg RB = gpr.R(b);
 
-    SXTW(XA, RA);
-    SXTW(CR, RB);
-    SUB(CR, XA, CR);
-
-    gpr.Unlock(WA);
+    SXTW(CR, RA);
+    SUB(CR, CR, RB, ArithOption(RB, ExtendSpecifier::SXTW));
   }
 }
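
For reference, below is a small, self-contained C++ sketch of what the optimized JitArm64::cmp paths compute; the helper names are made up for illustration and are not part of the patches or of Dolphin. It models the general case from patch 6, where the extended-register form of SUB folds the sign extension of rb into the subtraction itself, and the a == -1 shortcut from patch 4, which relies on (-1) - x == ~x in two's complement.

#include <cassert>
#include <cstdint>

// Models the 64-bit value the JIT leaves in the CR scratch register.

// General case (patch 6): "sub x_cr, x_cr, w_b, sxtw" sign-extends w_b
// as part of the subtraction, so no separate sxtw of b is emitted.
int64_t cmp_general(uint32_t ra, uint32_t rb)
{
  int64_t a = static_cast<int32_t>(ra);  // sxtw x_cr, w_a
  int64_t b = static_cast<int32_t>(rb);  // folded into the sub
  return a - b;
}

// a == -1 case (patch 4): (-1) - b == ~b in two's complement, so a single
// mvn followed by sxtw produces the same result.
int64_t cmp_a_minus_one(uint32_t rb)
{
  return static_cast<int32_t>(~rb);  // mvn w_cr, w_b ; sxtw x_cr, w_cr
}

int main()
{
  for (uint32_t rb : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu})
    assert(cmp_general(0xffffffffu, rb) == cmp_a_minus_one(rb));
  return 0;
}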