[a64] Implement `OPCODE_VECTOR_SHL`

Vector registers are passed by pointer rather than directly in the `Qn` registers, so the emulation helpers must take pointer-type arguments instead of vector-register types.

Fixes `OPCODE_VECTOR_SHL` and passes unit tests.
This commit is contained in:
Wunkolo 2024-05-04 11:23:45 -07:00
parent 3d345d71a7
commit 07a4df8e2f
1 changed files with 86 additions and 4 deletions

View File

@ -424,7 +424,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
// ============================================================================
// OPCODE_VECTOR_SHL
// ============================================================================
// Scalar fallback for OPCODE_VECTOR_SHL.
// Both vector operands arrive as pointers to 16-byte buffers (src1 = lane
// values, src2 = per-lane shift counts); the result is returned in a NEON
// register (Q0 under the native-call ABI). Each lane's shift amount is
// masked to the lane width, matching PPC vector-shift semantics.
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static uint8x16_t EmulateVectorShl(void*, std::byte src1[16],
                                   std::byte src2[16]) {
  alignas(16) T value[16 / sizeof(T)];
  alignas(16) T shamt[16 / sizeof(T)];
  // Load the guest vectors into C arrays.
  // NOTE: vst1q_u8/vld1q_u8 traffic in uint8_t*, so the casts must target
  // uint8_t, not T (and std::byte* must be cast explicitly as well).
  vst1q_u8(reinterpret_cast<uint8_t*>(value),
           vld1q_u8(reinterpret_cast<const uint8_t*>(src1)));
  vst1q_u8(reinterpret_cast<uint8_t*>(shamt),
           vld1q_u8(reinterpret_cast<const uint8_t*>(src2)));
  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
    // Shift count wraps modulo the element bit-width.
    value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
  }
  // Store result and return it.
  return vld1q_u8(reinterpret_cast<const uint8_t*>(value));
}
struct VECTOR_SHL_V128
: Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
static void Emit(A64Emitter& e, const EmitArgType& i) {
@ -444,11 +460,77 @@ struct VECTOR_SHL_V128
}
}
// Emits a vector shift-left for 16 x u8 lanes.
// Fast path: when src2 is a constant with an identical count in every lane,
// a single SHL-by-immediate suffices. Otherwise both operands are spilled
// to stack slots and passed by pointer to the scalar helper, whose result
// comes back in Q0.
static void EmitInt8(A64Emitter& e, const EmitArgType& i) {
  if (i.src2.is_constant) {
    const auto& shamt = i.src2.constant();
    // Compare every adjacent pair of lanes; all 16 must match.
    bool all_same = true;
    for (size_t n = 0; n < 15; ++n) {
      if (shamt.u8[n] != shamt.u8[n + 1]) {
        all_same = false;
        break;
      }
    }
    if (all_same) {
      // Every count is the same, so we can use SHL.
      // Mask to the lane width: byte SHL immediates only encode 0..7, and
      // this matches the modulo semantics of the emulated path.
      e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
      return;
    }
    e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
  } else {
    e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
  }
  e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
  e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
  e.MOV(i.dest.reg().B16(), Q0.B16());
}
// Emits a vector shift-left for 8 x u16 lanes.
// Fast path: when src2 is a constant whose 16 bytes are all identical,
// every u16 lane holds the same (byte-replicated) count and, modulo 16,
// that count equals the low byte — so a single SHL-by-immediate works.
// Otherwise fall back to the scalar helper via stack-slot pointers.
static void EmitInt16(A64Emitter& e, const EmitArgType& i) {
  if (i.src2.is_constant) {
    const auto& shamt = i.src2.constant();
    // Compare every adjacent pair of bytes; all 16 must match.
    bool all_same = true;
    for (size_t n = 0; n < 15; ++n) {
      if (shamt.u8[n] != shamt.u8[n + 1]) {
        all_same = false;
        break;
      }
    }
    if (all_same) {
      // Every count is the same, so we can use SHL.
      // Mask to the lane width: halfword SHL immediates only encode 0..15,
      // matching the modulo semantics of the emulated path.
      e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0] & 0xF);
      return;
    }
    e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
  } else {
    e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
  }
  e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
  e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
  e.MOV(i.dest.reg().B16(), Q0.B16());
}
// Emits a vector shift-left for 4 x u32 lanes.
// Fast path: when src2 is a constant whose 16 bytes are all identical,
// every u32 lane holds the same (byte-replicated) count and, modulo 32,
// that count equals the low byte — so a single SHL-by-immediate works.
// Otherwise fall back to the scalar helper via stack-slot pointers.
static void EmitInt32(A64Emitter& e, const EmitArgType& i) {
  if (i.src2.is_constant) {
    const auto& shamt = i.src2.constant();
    // Compare every adjacent pair of bytes; all 16 must match.
    bool all_same = true;
    for (size_t n = 0; n < 15; ++n) {
      if (shamt.u8[n] != shamt.u8[n + 1]) {
        all_same = false;
        break;
      }
    }
    if (all_same) {
      // Every count is the same, so we can use SHL.
      // Mask to the lane width: word SHL immediates only encode 0..31,
      // matching the modulo semantics of the emulated path.
      e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0] & 0x1F);
      return;
    }
    e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
  } else {
    e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
  }
  e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
  e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
  e.MOV(i.dest.reg().B16(), Q0.B16());
}
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
@ -491,7 +573,7 @@ static uint8x16_t EmulateVectorShr(void*, uint8x16_t src1, uint8x16_t src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
// Load SSE registers into a C array.
// Load NEON registers into a C array.
vst1q_u8(reinterpret_cast<T*>(value), src1);
vst1q_u8(reinterpret_cast<T*>(shamt), src2);