[a64] Use VectorCodeGenerator rather than CodeBlock+CodeGenerator

The emitter doesn't actually hold onto executable code; it just generates the assembly data into a buffer for the currently-resolving function before placing it into a code-cache. When code gets pushed into the code-cache, it can just be copied from a `std::vector`, which is then reset. The code-cache itself maintains the actual executable memory, stack-unwinding code, and such. This also fixes a bunch of erroneous relative-addressing glitches where relative addresses were calculated based on the address of the unused CodeBlock rather than being position-independent. `MOVP2R` in particular was generating different instructions depending on its distance from the code block, when it should always just use `MOV` and not do any relative-address calculations, since we can't predict where the actual instruction's offset will be (we cannot predict what the program counter will be). Oaknut probably needs a "position independent" policy or mode or something so that it avoids PC-relative instructions.
2024-06-08 14:23:59 -07:00 · 2024-06-08 14:23:59 -07:00 · 2953e2e6fc
parent 02edbd264d
commit 2953e2e6fc
4 changed files with 36 additions and 37 deletions
--- a/src/xenia/cpu/backend/a64/a64_emitter.cc
+++ b/src/xenia/cpu/backend/a64/a64_emitter.cc
@ -58,8 +58,6 @@ using xe::cpu::hir::Instr;
 using namespace xe::literals;
 using namespace oaknut::util;

-static const size_t kMaxCodeSize = 1_MiB;
-
 static const size_t kStashOffset = 32;
 // static const size_t kStashOffsetHigh = 32 + 32;

@ -73,8 +71,7 @@ const uint8_t A64Emitter::fpr_reg_map_[A64Emitter::FPR_COUNT] = {
 };

 A64Emitter::A64Emitter(A64Backend* backend)
-    : CodeBlock(kMaxCodeSize),
-      CodeGenerator(CodeBlock::ptr()),
+    : VectorCodeGenerator(assembly_buffer),
      processor_(backend->processor()),
      backend_(backend),
      code_cache_(backend->code_cache()) {
@ -138,23 +135,22 @@ bool A64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
 void* A64Emitter::Emplace(const EmitFunctionInfo& func_info,
                          GuestFunction* function) {
  // Copy the current oaknut instruction-buffer into the code-cache
-  uint32_t* old_address = CodeBlock::ptr();
  void* new_execute_address;
  void* new_write_address;

  assert_true(func_info.code_size.total == offset());

  if (function) {
-    code_cache_->PlaceGuestCode(function->address(), CodeBlock::ptr(),
+    code_cache_->PlaceGuestCode(function->address(), assembly_buffer.data(),
                                func_info, function, new_execute_address,
                                new_write_address);
  } else {
-    code_cache_->PlaceHostCode(0, CodeBlock::ptr(), func_info,
+    code_cache_->PlaceHostCode(0, assembly_buffer.data(), func_info,
                               new_execute_address, new_write_address);
  }

  // Reset the oaknut instruction-buffer
-  set_wptr(reinterpret_cast<uint32_t*>(old_address));
+  assembly_buffer.clear();
  label_lookup_.clear();

  return new_execute_address;
@ -224,7 +220,8 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

    // Call count.
    MOV(W0, 1);
-    MOVP2R(X5, low_address(&trace_header->function_call_count));
+    MOV(X5, reinterpret_cast<uintptr_t>(
+                low_address(&trace_header->function_call_count)));
    LDADDAL(X0, X0, X5);

    // Get call history slot.
@ -234,8 +231,8 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
    AND(W0, W0, 0b00000011);

    // Record call history value into slot (guest addr in W1).
-    MOV(X5, uint32_t(
-                uint64_t(low_address(&trace_header->function_caller_history))));
+    MOV(X5, reinterpret_cast<uintptr_t>(
+                low_address(&trace_header->function_caller_history)));
    STR(W1, X5, X0, oaknut::IndexExt::LSL, 2);

    // Calling thread. Load X0 with thread ID.
@ -243,7 +240,8 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
    MOV(W5, 1);
    LSL(W0, W5, W0);

-    MOVP2R(X5, low_address(&trace_header->function_thread_use));
+    MOV(X5, reinterpret_cast<uintptr_t>(
+                low_address(&trace_header->function_thread_use)));
    LDSET(W0, WZR, X5);
  }

@ -334,8 +332,9 @@ void A64Emitter::MarkSourceOffset(const Instr* i) {
    const uint32_t instruction_index =
        (entry->guest_address - trace_data_->start_address()) / 4;
    MOV(X0, 1);
-    MOVP2R(X1, low_address(trace_data_->instruction_execute_counts() +
-                           instruction_index * 8));
+    MOV(X1, reinterpret_cast<uintptr_t>(
+                low_address(trace_data_->instruction_execute_counts() +
+                            instruction_index * 8)));
    LDADDAL(X0, ZR, X1);
  }
 }
@ -803,11 +802,9 @@ void A64Emitter::FreeConstData(uintptr_t data) {
                       memory::DeallocationType::kRelease);
 }

-std::byte* A64Emitter::GetVConstPtr() const {
-  return reinterpret_cast<std::byte*>(backend_->emitter_data());
-}
+uintptr_t A64Emitter::GetVConstPtr() const { return backend_->emitter_data(); }

-std::byte* A64Emitter::GetVConstPtr(VConst id) const {
+uintptr_t A64Emitter::GetVConstPtr(VConst id) const {
  // Load through fixed constant table setup by PlaceConstData.
  // It's important that the pointer is not signed, as it will be sign-extended.
  return GetVConstPtr() + GetVConstOffset(id);
--- a/src/xenia/cpu/backend/a64/a64_emitter.h
+++ b/src/xenia/cpu/backend/a64/a64_emitter.h
@ -122,7 +122,7 @@ enum A64EmitterFeatureFlags {
  kA64EmitF16C = 1 << 1,
 };

-class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
+class A64Emitter : public oaknut::VectorCodeGenerator {
 public:
  A64Emitter(A64Backend* backend);
  virtual ~A64Emitter();
@ -203,8 +203,8 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
  static bool ConstantFitsIn32Reg(uint64_t v);
  void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v);

-  std::byte* GetVConstPtr() const;
-  std::byte* GetVConstPtr(VConst id) const;
+  uintptr_t GetVConstPtr() const;
+  uintptr_t GetVConstPtr(VConst id) const;
  static constexpr uintptr_t GetVConstOffset(VConst id) {
    return sizeof(vec128_t) * id;
  }
@ -239,6 +239,8 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
  A64CodeCache* code_cache_ = nullptr;
  uint32_t feature_flags_ = 0;

+  std::vector<std::uint32_t> assembly_buffer;
+
  oaknut::Label* epilog_label_ = nullptr;

  // Convert from plain-text label-names into oaknut-labels
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@ -83,10 +83,10 @@ struct LOAD_VECTOR_SHL_I8
    if (i.src1.is_constant) {
      auto sh = i.src1.constant();
      assert_true(sh < xe::countof(lvsl_table));
-      e.MOVP2R(X0, &lvsl_table[sh]);
+      e.MOV(X0, reinterpret_cast<uintptr_t>(&lvsl_table[sh]));
      e.LDR(i.dest, X0);
    } else {
-      e.MOVP2R(X0, lvsl_table);
+      e.MOV(X0, reinterpret_cast<uintptr_t>(lvsl_table));
      e.AND(X1, i.src1.reg().toX(), 0xf);
      e.LDR(i.dest, X0, X1, IndexExt::LSL, 4);
    }
@ -121,10 +121,10 @@ struct LOAD_VECTOR_SHR_I8
    if (i.src1.is_constant) {
      auto sh = i.src1.constant();
      assert_true(sh < xe::countof(lvsr_table));
-      e.MOVP2R(X0, &lvsr_table[sh]);
+      e.MOV(X0, reinterpret_cast<uintptr_t>(&lvsr_table[sh]));
      e.LDR(i.dest, X0);
    } else {
-      e.MOVP2R(X0, lvsr_table);
+      e.MOV(X0, reinterpret_cast<uintptr_t>(lvsr_table));
      e.AND(X1, i.src1.reg().toX(), 0xf);
      e.LDR(i.dest, X0, X1, IndexExt::LSL, 4);
    }
@ -1007,7 +1007,7 @@ struct EXTRACT_I32
      e.AND(X0, i.src2.reg().toX(), 0b11);
      e.LSL(X0, X0, 4);

-      e.MOVP2R(X1, extract_table_32);
+      e.MOV(X1, reinterpret_cast<uintptr_t>(extract_table_32));
      e.LDR(Q0, X1, X0);

      // Byte-table lookup
@ -1335,7 +1335,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    }

    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
    // are valid - max before min to pack NaN as zero (5454082B is heavily
@ -1435,7 +1435,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
      e.LoadConstantV(src, i.src1.constant());
    }
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    // Saturate
    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
@ -1456,7 +1456,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
      e.LoadConstantV(src, i.src1.constant());
    }
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    // Saturate
    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
@ -1478,7 +1478,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
      e.LoadConstantV(src, i.src1.constant());
    }
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    // Saturate.
    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked));
@ -1519,7 +1519,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
      e.LoadConstantV(src, i.src1.constant());
    }
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    // Saturate.
    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked));
@ -1740,7 +1740,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
  static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) {
    // ARGB (WXYZ) -> RGBA (XYZW)
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    QReg src(0);

@ -1849,7 +1849,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
    // src is (xx,xx,xx,VALUE)
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    QReg src(0);
    if (i.src1.is_constant) {
@ -1892,7 +1892,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    // src is (xx,xx,VALUE,VALUE)

    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    QReg src(0);
    if (i.src1.is_constant) {
@ -1928,7 +1928,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
  }
  static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    QReg src(0);
    if (i.src1.is_constant) {
@ -1972,7 +1972,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
  }
  static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
    const XReg VConstData = X3;
-    e.MOVP2R(VConstData, e.GetVConstPtr());
+    e.MOV(VConstData, e.GetVConstPtr());

    QReg src(0);
    if (i.src1.is_constant) {
--- a/src/xenia/cpu/backend/a64/a64_sequences.cc
+++ b/src/xenia/cpu/backend/a64/a64_sequences.cc
@ -2758,7 +2758,7 @@ struct SET_ROUNDING_MODE_I32
    e.AND(W1, i.src1, 0b111);

    // Use the low 3 bits as an index into a LUT
-    e.MOVP2R(X0, fpcr_table);
+    e.MOV(X0, reinterpret_cast<uintptr_t>(fpcr_table));
    e.LDRB(W0, X0, X1);

    // Replace FPCR bits with new value