From 4dbf0b8e904ad3a338809ff1df4ba87836fffebe Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 19 Oct 2022 19:35:10 +0200 Subject: [PATCH] JitArm64: Reimplement Force25BitPrecision The previous implementation of Force25BitPrecision was essentially a translation of the x86-64 implementation. It worked, but we can make a more efficient implementation by using an AArch64 instruction I don't believe x86-64 has an equivalent of: URSHR. The latency is the same as before, but the instruction count and register count are both reduced. --- Source/Core/Common/Arm64Emitter.cpp | 37 ++++++++++++++++++- Source/Core/Common/Arm64Emitter.h | 8 +++- Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +- .../JitArm64/JitArm64_FloatingPoint.cpp | 33 ++++++----------- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 8 +--- 5 files changed, 57 insertions(+), 32 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 2d6531503b..30dbaacd46 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2354,7 +2354,7 @@ void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64R void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - Write32((2 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) | + Write32((1 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } @@ -3540,7 +3540,26 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) EmitPermute(size, 0b111, Rd, Rn, Rm); } -// Shift by immediate +// Scalar shift by immediate +void ARM64FloatEmitter::SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + constexpr size_t src_size = 64; + ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!"); + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", + shift, src_size); + EmitScalarShiftImm(0, src_size | shift, 0b01010, Rd, Rn); +} + +void ARM64FloatEmitter::URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + constexpr size_t src_size = 64; + ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!"); + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitScalarShiftImm(1, src_size * 2 - shift, 0b00100, Rd, Rn); +} + +// Vector shift by immediate void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { SSHLL(src_size, Rd, Rn, shift, false); @@ -3582,6 +3601,13 @@ void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) UXTL(src_size, Rd, Rn, true); } +void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn); +} + void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", @@ -3589,6 +3615,13 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn); } +void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn); +} + void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a180b5282c..caf997a29d 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1229,9 +1229,15 @@ public: void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); - // Shift by immediate + // Scalar shift by immediate + void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift); + + // Vector shift by immediate + void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 75565b771d..2d0f39b66b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -333,8 +333,7 @@ protected: bool Rc = false); void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg); - void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input, - Arm64Gen::ARM64Reg temp); + void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input); // std::map m_fault_to_handler; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 40bad6ee03..260da2d900 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -45,24 +45,18 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg) // Emulate the odd truncation/rounding that the PowerPC does on the RHS operand 
before // a single precision multiply. To be precise, it drops the low 28 bits of the mantissa, // rounding to nearest as it does. -void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp) +void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input) { - ASSERT(output != input && output != temp && input != temp); - - // temp = 0x0000'0000'0800'0000ULL - // output = 0xFFFF'FFFF'F800'0000ULL - m_float_emit.MOVI(32, temp, 0x08, 24); - m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL); - m_float_emit.BIC(temp, temp, output); - m_float_emit.ORR(32, output, 0xF8, 24); - - // output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1) - m_float_emit.AND(temp, input, temp); - m_float_emit.AND(output, input, output); if (IsQuad(input)) - m_float_emit.ADD(64, output, output, temp); + { + m_float_emit.URSHR(64, output, input, 28); + m_float_emit.SHL(64, output, output, 28); + } else - m_float_emit.ADD(output, output, temp); + { + m_float_emit.URSHR(output, input, 28); + m_float_emit.SHL(output, output, 28); + } } void JitArm64::fp_arith(UGeckoInstruction inst) @@ -113,9 +107,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); V0Q = fpr.GetReg(); - V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q)); + Force25BitPrecision(reg_encoder(V0Q), VC); VC = reg_encoder(V0Q); } @@ -160,18 +153,16 @@ void JitArm64::fp_arith(UGeckoInstruction inst) { ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - V0Q = fpr.GetReg(); V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q)); + Force25BitPrecision(reg_encoder(V1Q), VC); VC = reg_encoder(V1Q); } ARM64Reg inaccurate_fma_temp_reg = VD; if (inaccurate_fma && d == b) { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetReg(); + V0Q = fpr.GetReg(); inaccurate_fma_temp_reg = reg_encoder(V0Q); } diff --git 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 7682679044..0d786f80a0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -103,12 +103,9 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); V0Q = fpr.GetReg(); - const ARM64Reg V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q)); + Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC)); VC = reg_encoder(V0Q); - - fpr.Unlock(V1Q); } m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0); @@ -165,10 +162,9 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) { ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); - allocate_v0_if_needed(); V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V1Q), VC, V0); + Force25BitPrecision(reg_encoder(V1Q), VC); VC = reg_encoder(V1Q); }