[a64] Optimize `OPCODE_SPLAT` with `MOVI`/`FMOV`

Moves the `FMOV` constant functions into `a64_util` so they are available to other translation units. Optimizes constant splats with conditional use of `MOVI` and `FMOV`.
2024-06-13 14:07:01 -07:00 · 2024-06-13 14:07:01 -07:00 · 9c8b0678a5
parent 539a03d5f6
commit 9c8b0678a5
3 changed files with 110 additions and 79 deletions
--- a/src/xenia/cpu/backend/a64/a64_emitter.cc
+++ b/src/xenia/cpu/backend/a64/a64_emitter.cc
@@ -8,6 +8,7 @@
 */

 #include "xenia/cpu/backend/a64/a64_emitter.h"
+#include "xenia/cpu/backend/a64/a64_util.h"

 #include <cstddef>

@@ -810,74 +811,6 @@ uintptr_t A64Emitter::GetVConstPtr(VConst id) const {
  return GetVConstPtr() + GetVConstOffset(id);
 }

-// Attempts to convert an fp32 bit-value into an fp8-immediate value for FMOV
-// returns false if the value cannot be represented
-// C2.2.3 Modified immediate constants in A64 floating-point instructions
-// abcdefgh
-//    V
-// aBbbbbbc defgh000 00000000 00000000
-// B = NOT(b)
-static bool f32_to_fimm8(uint32_t u32, oaknut::FImm8& fp8) {
-  const uint32_t sign = (u32 >> 31) & 1;
-  int32_t exp = ((u32 >> 23) & 0xff) - 127;
-  int64_t mantissa = u32 & 0x7fffff;
-
-  // Too many mantissa bits
-  if (mantissa & 0x7ffff) {
-    return false;
-  }
-  // Too many exp bits
-  if (exp < -3 || exp > 4) {
-    return false;
-  }
-
-  // mantissa = (16 + e:f:g:h) / 16.
-  mantissa >>= 19;
-  if ((mantissa & 0b1111) != mantissa) {
-    return false;
-  }
-
-  // exp = (NOT(b):c:d) - 3
-  exp = ((exp + 3) & 0b111) ^ 0b100;
-
-  fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa));
-  return true;
-}
-
-// Attempts to convert an fp64 bit-value into an fp8-immediate value for FMOV
-// returns false if the value cannot be represented
-// C2.2.3 Modified immediate constants in A64 floating-point instructions
-// abcdefgh
-//    V
-// aBbbbbbb bbcdefgh 00000000 00000000 00000000 00000000 00000000 00000000
-// B = NOT(b)
-static bool f64_to_fimm8(uint64_t u64, oaknut::FImm8& fp8) {
-  const uint32_t sign = (u64 >> 63) & 1;
-  int32_t exp = ((u64 >> 52) & 0x7ff) - 1023;
-  int64_t mantissa = u64 & 0xfffffffffffffULL;
-
-  // Too many mantissa bits
-  if (mantissa & 0xffffffffffffULL) {
-    return false;
-  }
-  // Too many exp bits
-  if (exp < -3 || exp > 4) {
-    return false;
-  }
-
-  // mantissa = (16 + e:f:g:h) / 16.
-  mantissa >>= 48;
-  if ((mantissa & 0b1111) != mantissa) {
-    return false;
-  }
-
-  // exp = (NOT(b):c:d) - 3
-  exp = ((exp + 3) & 0b111) ^ 0b100;
-
-  fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa));
-  return true;
-}
-
 // Implies possible StashV(0, ...)!
 void A64Emitter::LoadConstantV(oaknut::QReg dest, const vec128_t& v) {
  if (!v.low && !v.high) {
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@@ -8,6 +8,7 @@
 */

 #include "xenia/cpu/backend/a64/a64_sequences.h"
+#include "xenia/cpu/backend/a64/a64_util.h"

 #include <algorithm>
 #include <cstring>
@@ -1026,12 +1027,7 @@ EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32);
 struct SPLAT_I8 : Sequence<SPLAT_I8, I<OPCODE_SPLAT, V128Op, I8Op>> {
  static void Emit(A64Emitter& e, const EmitArgType& i) {
    if (i.src1.is_constant) {
-      if (i.src1.constant() <= 0xFF) {
-        e.MOVI(i.dest.reg().B16(), i.src1.constant());
-        return;
-      }
-      e.MOV(W0, i.src1.constant());
-      e.DUP(i.dest.reg().B16(), W0);
+      e.MOVI(i.dest.reg().B16(), i.src1.constant());
    } else {
      e.DUP(i.dest.reg().B16(), i.src1);
    }
@@ -1040,9 +1036,12 @@ struct SPLAT_I8 : Sequence<SPLAT_I8, I<OPCODE_SPLAT, V128Op, I8Op>> {
 struct SPLAT_I16 : Sequence<SPLAT_I16, I<OPCODE_SPLAT, V128Op, I16Op>> {
  static void Emit(A64Emitter& e, const EmitArgType& i) {
    if (i.src1.is_constant) {
-      if (i.src1.constant() <= 0xFF) {
+      if ((i.src1.constant() & 0xFF'00) == 0) {
        e.MOVI(i.dest.reg().H8(), i.src1.constant());
        return;
+      } else if ((i.src1.constant() & 0x00'FF) == 0) {
+        e.MOVI(i.dest.reg().H8(), i.src1.constant(), oaknut::util::LSL, 8);
+        return;
      }
      e.MOV(W0, i.src1.constant());
      e.DUP(i.dest.reg().H8(), W0);
@@ -1054,9 +1053,22 @@ struct SPLAT_I16 : Sequence<SPLAT_I16, I<OPCODE_SPLAT, V128Op, I16Op>> {
 struct SPLAT_I32 : Sequence<SPLAT_I32, I<OPCODE_SPLAT, V128Op, I32Op>> {
  static void Emit(A64Emitter& e, const EmitArgType& i) {
    if (i.src1.is_constant) {
-      if (i.src1.constant() <= 0xFF) {
+      oaknut::FImm8 fp8(0);
+      if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) {
+        e.FMOV(i.dest.reg().S4(), fp8);
+        return;
+      } else if ((i.src1.constant() & 0xFF'FF'FF'00) == 0) {
        e.MOVI(i.dest.reg().S4(), i.src1.constant());
        return;
+      } else if ((i.src1.constant() & 0xFF'FF'00'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 8);
+        return;
+      } else if ((i.src1.constant() & 0xFF'00'FF'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 16);
+        return;
+      } else if ((i.src1.constant() & 0x00'FF'FF'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 24);
+        return;
      }
      e.MOV(W0, i.src1.constant());
      e.DUP(i.dest.reg().S4(), W0);
@@ -1068,8 +1080,24 @@ struct SPLAT_I32 : Sequence<SPLAT_I32, I<OPCODE_SPLAT, V128Op, I32Op>> {
 struct SPLAT_F32 : Sequence<SPLAT_F32, I<OPCODE_SPLAT, V128Op, F32Op>> {
  static void Emit(A64Emitter& e, const EmitArgType& i) {
    if (i.src1.is_constant) {
-      if (i.src1.value->constant.i32 <= 0xFF) {
-        e.MOVI(i.dest.reg().S4(), i.src1.value->constant.i32);
+      oaknut::FImm8 fp8(0);
+      if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) {
+        e.FMOV(i.dest.reg().S4(), fp8);
+        return;
+      } else if ((i.src1.value->constant.u32 & 0xFF'FF'FF'00) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32);
+        return;
+      } else if ((i.src1.value->constant.u32 & 0xFF'FF'00'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL,
+               8);
+        return;
+      } else if ((i.src1.value->constant.u32 & 0xFF'00'FF'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL,
+               16);
+        return;
+      } else if ((i.src1.value->constant.u32 & 0x00'FF'FF'FF) == 0) {
+        e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL,
+               24);
        return;
      }
      e.MOV(W0, i.src1.value->constant.i32);
--- a/src/xenia/cpu/backend/a64/a64_util.h
+++ b/src/xenia/cpu/backend/a64/a64_util.h
@@ -17,7 +17,77 @@
 namespace xe {
 namespace cpu {
 namespace backend {
-namespace a64 {}  // namespace a64
+namespace a64 {
+
+// Attempts to convert an fp32 bit-value into an fp8-immediate value for FMOV
+// returns false if the value cannot be represented
+// C2.2.3 Modified immediate constants in A64 floating-point instructions
+// abcdefgh
+//    V
+// aBbbbbbc defgh000 00000000 00000000
+// B = NOT(b)
+constexpr bool f32_to_fimm8(uint32_t u32, oaknut::FImm8& fp8) {
+  const uint32_t sign = (u32 >> 31) & 1;
+  int32_t exp = ((u32 >> 23) & 0xff) - 127;  // 8-bit exponent field minus 127
+  int64_t mantissa = u32 & 0x7fffff;         // low 23 bits: the fraction field
+
+  // The low 19 fraction bits must be zero; only the top 4 are kept below
+  if (mantissa & 0x7ffff) {
+    return false;
+  }
+  // Only unbiased exponents in [-3, 4] are accepted
+  if (exp < -3 || exp > 4) {
+    return false;
+  }
+
+  // Keep the top 4 fraction bits e:f:g:h; value = (16 + e:f:g:h) / 16.
+  mantissa >>= 19;
+  if ((mantissa & 0b1111) != mantissa) {  // never true: 23 bits >> 19 is 4 bits
+    return false;
+  }
+
+  // exp = (NOT(b):c:d) - 3, so add 3 back and flip the top bit to get b:c:d
+  exp = ((exp + 3) & 0b111) ^ 0b100;
+
+  fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa));  // written only on success
+  return true;
+}
+
+// Attempts to convert an fp64 bit-value into an fp8-immediate value for FMOV
+// returns false if the value cannot be represented
+// C2.2.3 Modified immediate constants in A64 floating-point instructions
+// abcdefgh
+//    V
+// aBbbbbbb bbcdefgh 00000000 00000000 00000000 00000000 00000000 00000000
+// B = NOT(b)
+constexpr bool f64_to_fimm8(uint64_t u64, oaknut::FImm8& fp8) {
+  const uint32_t sign = (u64 >> 63) & 1;
+  int32_t exp = ((u64 >> 52) & 0x7ff) - 1023;  // 11-bit exponent field - 1023
+  int64_t mantissa = u64 & 0xfffffffffffffULL;  // low 52 bits: fraction field
+
+  // The low 48 fraction bits must be zero; only the top 4 are kept below
+  if (mantissa & 0xffffffffffffULL) {
+    return false;
+  }
+  // Only unbiased exponents in [-3, 4] are accepted
+  if (exp < -3 || exp > 4) {
+    return false;
+  }
+
+  // Keep the top 4 fraction bits e:f:g:h; value = (16 + e:f:g:h) / 16.
+  mantissa >>= 48;
+  if ((mantissa & 0b1111) != mantissa) {  // never true: 52 bits >> 48 is 4 bits
+    return false;
+  }
+
+  // exp = (NOT(b):c:d) - 3, so add 3 back and flip the top bit to get b:c:d
+  exp = ((exp + 3) & 0b111) ^ 0b100;
+
+  fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa));  // written only on success
+  return true;
+}
+
+}  // namespace a64
 }  // namespace backend
 }  // namespace cpu
 }  // namespace xe