diff --git a/.gitmodules b/.gitmodules index c27065533..142b85d70 100644 --- a/.gitmodules +++ b/.gitmodules @@ -85,3 +85,6 @@ [submodule "third_party/VulkanMemoryAllocator"] path = third_party/VulkanMemoryAllocator url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git +[submodule "third_party/oaknut"] + path = third_party/oaknut + url = https://github.com/merryhime/oaknut.git diff --git a/premake5.lua b/premake5.lua index 6739d6dba..accb8f91e 100644 --- a/premake5.lua +++ b/premake5.lua @@ -54,7 +54,7 @@ filter("configurations:Checked") defines({ "DEBUG", }) -filter({"configurations:Checked", "platforms:Windows"}) +filter({"configurations:Checked", "platforms:Windows-*"}) buildoptions({ "/RTCsu", -- Full Run-Time Checks. }) @@ -153,7 +153,7 @@ filter("platforms:Android-*") "log", }) -filter("platforms:Windows") +filter("platforms:Windows-*") system("windows") toolset("msc") buildoptions({ @@ -179,8 +179,12 @@ filter("platforms:Windows") "_CRT_SECURE_NO_WARNINGS", "WIN32", "_WIN64=1", - "_AMD64=1", }) + filter("architecture:x86_64") + defines({ + "_AMD64=1", + }) + filter({}) linkoptions({ "/ignore:4006", -- Ignores complaints about empty obj files. "/ignore:4221", @@ -198,7 +202,7 @@ filter("platforms:Windows") }) -- Embed the manifest for things like dependencies and DPI awareness. -filter({"platforms:Windows", "kind:ConsoleApp or WindowedApp"}) +filter({"platforms:Windows-*", "kind:ConsoleApp or WindowedApp"}) files({ "src/xenia/base/app_win32.manifest" }) @@ -228,7 +232,12 @@ workspace("xenia") ["ARCHS"] = "x86_64" }) elseif os.istarget("windows") then - platforms({"Windows"}) + platforms({"Windows-ARM64", "Windows-x86_64"}) + filter("platforms:Windows-ARM64") + architecture("ARM64") + filter("platforms:Windows-x86_64") + architecture("x86_64") + filter({}) -- 10.0.15063.0: ID3D12GraphicsCommandList1::SetSamplePositions. -- 10.0.19041.0: D3D12_HEAP_FLAG_CREATE_NOT_ZEROED. -- 10.0.22000.0: DWMWA_WINDOW_CORNER_PREFERENCE. @@ -284,7 +293,13 @@ workspace("xenia") include("src/xenia/apu/nop") include("src/xenia/base") include("src/xenia/cpu") - include("src/xenia/cpu/backend/x64") + + filter("architecture:x86_64") + include("src/xenia/cpu/backend/x64") + filter("architecture:ARM64") + include("src/xenia/cpu/backend/a64") + filter({}) + include("src/xenia/debug/ui") include("src/xenia/gpu") include("src/xenia/gpu/null") diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 86fcef758..eb9ded7da 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -32,6 +32,7 @@ project("xenia-app") "libavcodec", "libavutil", "mspack", + "SDL2", "snappy", "xxhash", }) @@ -72,13 +73,18 @@ project("xenia-app") "xenia-cpu-backend-x64", }) + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + -- TODO(Triang3l): The emulator itself on Android. filter("platforms:not Android-*") files({ "xenia_main.cc", }) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ "main_resources.rc", }) @@ -104,7 +110,7 @@ project("xenia-app") "SDL2", }) - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "xenia-apu-xaudio2", "xenia-gpu-d3d12", @@ -113,13 +119,13 @@ project("xenia-app") "xenia-ui-d3d12", }) - filter({"platforms:Windows", SINGLE_LIBRARY_FILTER}) + filter({"platforms:Windows-*", SINGLE_LIBRARY_FILTER}) links({ "xenia-gpu-d3d12-trace-viewer", "xenia-ui-window-d3d12-demo", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. 
local user_file = project_root.."/build/xenia-app.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc index 058eae43a..dd9972ad6 100644 --- a/src/xenia/base/clock.cc +++ b/src/xenia/base/clock.cc @@ -21,8 +21,9 @@ DEFINE_bool(clock_no_scaling, false, "Guest system time is directly pulled from host.", "CPU"); DEFINE_bool(clock_source_raw, false, - "Use the RDTSC instruction as the time source. " - "Host CPU must support invariant TSC.", + "On x64, Use the RDTSC instruction as the time source. Requires " + "invariant TSC. " + "On a64, Use the CNTVCT_EL0 register as the time source", "CPU"); namespace xe { diff --git a/src/xenia/base/clock.h b/src/xenia/base/clock.h index 67a3ebb67..1b57d8b52 100644 --- a/src/xenia/base/clock.h +++ b/src/xenia/base/clock.h @@ -18,6 +18,8 @@ #if XE_ARCH_AMD64 #define XE_CLOCK_RAW_AVAILABLE 1 +#elif XE_ARCH_ARM64 +#define XE_CLOCK_RAW_AVAILABLE 1 #endif DECLARE_bool(clock_no_scaling); diff --git a/src/xenia/base/clock_a64.cc b/src/xenia/base/clock_a64.cc new file mode 100644 index 000000000..6ca3569fe --- /dev/null +++ b/src/xenia/base/clock_a64.cc @@ -0,0 +1,50 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/base/clock.h" +#include "xenia/base/platform.h" + +#if XE_ARCH_ARM64 && XE_CLOCK_RAW_AVAILABLE + +#include "xenia/base/logging.h" + +#ifdef _MSC_VER +#include +#include +#else +#include +#endif + +// Wrap all these different cpu compiler intrinsics. +#if XE_COMPILER_MSVC +constexpr int32_t CNTFRQ_EL0 = ARM64_SYSREG(3, 3, 14, 0, 0); +constexpr int32_t CNTVCT_EL0 = ARM64_SYSREG(3, 3, 14, 0, 2); +#define xe_cpu_mrs(reg) _ReadStatusReg(reg) +#elif XE_COMPILER_CLANG || XE_COMPILER_GNUC +constexpr int32_t CNTFRQ_EL0 = 0b11'011'1110'0000'000; +constexpr int32_t CNTVCT_EL0 = 0b11'011'1110'0000'010; + +uint64_t xe_cpu_mrs(uint32_t reg) { + uint64_t result; + __asm__ volatile("mrs \t%0," #reg : "=r"(result)); + return result; +} +#else +#error \ + "No cpu instruction wrappers xe_cpu_mrs(CNTVCT_EL0); for current compiler implemented." 
+#endif + +namespace xe { + +uint64_t Clock::host_tick_frequency_raw() { return xe_cpu_mrs(CNTFRQ_EL0); } +uint64_t Clock::host_tick_count_raw() { return xe_cpu_mrs(CNTVCT_EL0); } + +} // namespace xe + +#endif diff --git a/src/xenia/base/exception_handler_win.cc b/src/xenia/base/exception_handler_win.cc index 786a129a5..49e49643f 100644 --- a/src/xenia/base/exception_handler_win.cc +++ b/src/xenia/base/exception_handler_win.cc @@ -36,12 +36,22 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { } HostThreadContext thread_context; + +#if XE_ARCH_AMD64 thread_context.rip = ex_info->ContextRecord->Rip; thread_context.eflags = ex_info->ContextRecord->EFlags; std::memcpy(thread_context.int_registers, &ex_info->ContextRecord->Rax, sizeof(thread_context.int_registers)); std::memcpy(thread_context.xmm_registers, &ex_info->ContextRecord->Xmm0, sizeof(thread_context.xmm_registers)); +#elif XE_ARCH_ARM64 + thread_context.pc = ex_info->ContextRecord->Pc; + thread_context.cpsr = ex_info->ContextRecord->Cpsr; + std::memcpy(thread_context.x, &ex_info->ContextRecord->X, + sizeof(thread_context.x)); + std::memcpy(thread_context.v, &ex_info->ContextRecord->V, + sizeof(thread_context.v)); +#endif // https://msdn.microsoft.com/en-us/library/ms679331(v=vs.85).aspx // https://msdn.microsoft.com/en-us/library/aa363082(v=vs.85).aspx @@ -78,6 +88,7 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { for (size_t i = 0; i < xe::countof(handlers_) && handlers_[i].first; ++i) { if (handlers_[i].first(&ex, handlers_[i].second)) { // Exception handled. +#if XE_ARCH_AMD64 ex_info->ContextRecord->Rip = thread_context.rip; ex_info->ContextRecord->EFlags = thread_context.eflags; uint32_t modified_register_index; @@ -98,6 +109,28 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { &thread_context.xmm_registers[modified_register_index], sizeof(vec128_t)); } +#elif XE_ARCH_ARM64 + ex_info->ContextRecord->Pc = thread_context.pc; + ex_info->ContextRecord->Cpsr = thread_context.cpsr; + uint32_t modified_register_index; + uint16_t modified_int_registers_remaining = ex.modified_x_registers(); + while (xe::bit_scan_forward(modified_int_registers_remaining, + &modified_register_index)) { + modified_int_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + ex_info->ContextRecord->X[modified_register_index] = + thread_context.x[modified_register_index]; + } + uint16_t modified_xmm_registers_remaining = ex.modified_v_registers(); + while (xe::bit_scan_forward(modified_xmm_registers_remaining, + &modified_register_index)) { + modified_xmm_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + std::memcpy(&ex_info->ContextRecord->V + modified_register_index, + &thread_context.v[modified_register_index], + sizeof(vec128_t)); + } +#endif return EXCEPTION_CONTINUE_EXECUTION; } } diff --git a/src/xenia/base/host_thread_context.cc b/src/xenia/base/host_thread_context.cc index bf668bdd3..24b2b6e12 100644 --- a/src/xenia/base/host_thread_context.cc +++ b/src/xenia/base/host_thread_context.cc @@ -67,7 +67,7 @@ std::string HostThreadContext::GetStringFromValue(HostRegister reg, case Arm64Register::kPc: return hex ? string_util::to_hex_string(pc) : std::to_string(pc); case Arm64Register::kPstate: - return hex ? string_util::to_hex_string(pstate) : std::to_string(pstate); + return hex ? string_util::to_hex_string(cpsr) : std::to_string(cpsr); case Arm64Register::kFpsr: return hex ? 
string_util::to_hex_string(fpsr) : std::to_string(fpsr); case Arm64Register::kFpcr: diff --git a/src/xenia/base/host_thread_context.h b/src/xenia/base/host_thread_context.h index 554d09f44..6379f62f8 100644 --- a/src/xenia/base/host_thread_context.h +++ b/src/xenia/base/host_thread_context.h @@ -202,7 +202,7 @@ class HostThreadContext { uint64_t x[31]; uint64_t sp; uint64_t pc; - uint64_t pstate; + uint32_t cpsr; uint32_t fpsr; uint32_t fpcr; vec128_t v[32]; diff --git a/src/xenia/base/main_init_win.cc b/src/xenia/base/main_init_win.cc index 6b0a9059a..e67e50b66 100644 --- a/src/xenia/base/main_init_win.cc +++ b/src/xenia/base/main_init_win.cc @@ -11,6 +11,8 @@ #include +#if XE_ARCH_AMD64 + // Includes Windows headers, so it goes after platform_win.h. #include "third_party/xbyak/xbyak/xbyak_util.h" @@ -39,3 +41,5 @@ class StartupAvxCheck { #pragma warning(suppress : 4073) #pragma init_seg(lib) static StartupAvxCheck gStartupAvxCheck; + +#endif \ No newline at end of file diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 55dce4b45..14dd8d6b1 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -31,6 +31,8 @@ #if XE_ARCH_AMD64 #include +#elif XE_ARCH_ARM64 +#include #endif namespace xe { @@ -135,10 +137,17 @@ constexpr inline uint32_t bit_count(T v) { } #else #if XE_COMPILER_MSVC || XE_COMPILER_INTEL +#if XE_ARCH_AMD64 inline uint32_t bit_count(uint32_t v) { return __popcnt(v); } inline uint32_t bit_count(uint64_t v) { return static_cast(__popcnt64(v)); } +#elif XE_ARCH_ARM64 +inline uint32_t bit_count(uint32_t v) { return _CountOneBits(v); } +inline uint32_t bit_count(uint64_t v) { + return static_cast(_CountOneBits64(v)); +} +#endif #elif XE_COMPILER_GCC || XE_COMPILER_CLANG static_assert(sizeof(unsigned int) == sizeof(uint32_t)); static_assert(sizeof(unsigned long long) == sizeof(uint64_t)); @@ -372,6 +381,24 @@ template int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } +#elif XE_ARCH_ARM64 +// Utilities for NEON values. +template +float m128_f32(const float32x4_t& v) { + return vgetq_lane_f32(v, N); +} +template +int32_t m128_i32(const int32x4_t& v) { + return vgetq_lane_s32(v, N); +} +template +double m128_f64(const float64x2_t& v) { + return vgetq_lane_f64(v, N); +} +template +int64_t m128_i64(const int64x2_t& v) { + return vgetq_lane_s64(v, N); +} #endif // Similar to the C++ implementation of XMConvertFloatToHalf and diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 439d0c467..c852ad649 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -66,6 +66,14 @@ #define XE_ARCH_PPC 1 #endif +#ifdef XE_ARCH_AMD64 +#define XE_HOST_ARCH_NAME "x64" +#elif XE_ARCH_ARM64 +#define XE_HOST_ARCH_NAME "a64" +#elif XE_ARCH_PPC +#define XE_HOST_ARCH_NAME "ppc" +#endif + #if XE_PLATFORM_WIN32 #define WIN32_LEAN_AND_MEAN #define NOMINMAX // Don't want windows.h including min/max macros. diff --git a/src/xenia/cpu/backend/a64/a64_assembler.cc b/src/xenia/cpu/backend/a64/a64_assembler.cc new file mode 100644 index 000000000..280b82468 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_assembler.cc @@ -0,0 +1,146 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_assembler.h" + +#include + +#include "third_party/capstone/include/capstone/arm64.h" +#include "third_party/capstone/include/capstone/capstone.h" +#include "xenia/base/profiling.h" +#include "xenia/base/reset_scope.h" +#include "xenia/base/string.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/cpu_flags.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/hir/label.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using xe::cpu::hir::HIRBuilder; + +A64Assembler::A64Assembler(A64Backend* backend) + : Assembler(backend), a64_backend_(backend), capstone_handle_(0) { + if (cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) != + CS_ERR_OK) { + assert_always("Failed to initialize capstone"); + } + cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); + cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_OFF); +} + +A64Assembler::~A64Assembler() { + // Emitter must be freed before the allocator. + emitter_.reset(); + + if (capstone_handle_) { + cs_close(&capstone_handle_); + } +} + +bool A64Assembler::Initialize() { + if (!Assembler::Initialize()) { + return false; + } + + emitter_.reset(new A64Emitter(a64_backend_)); + + return true; +} + +void A64Assembler::Reset() { + string_buffer_.Reset(); + Assembler::Reset(); +} + +bool A64Assembler::Assemble(GuestFunction* function, HIRBuilder* builder, + uint32_t debug_info_flags, + std::unique_ptr debug_info) { + SCOPE_profile_cpu_f("cpu"); + + // Reset when we leave. + xe::make_reset_scope(this); + + // Lower HIR -> a64. + void* machine_code = nullptr; + size_t code_size = 0; + if (!emitter_->Emit(function, builder, debug_info_flags, debug_info.get(), + &machine_code, &code_size, &function->source_map())) { + return false; + } + + // Stash generated machine code. + if (debug_info_flags & DebugInfoFlags::kDebugInfoDisasmMachineCode) { + DumpMachineCode(machine_code, code_size, function->source_map(), + &string_buffer_); + debug_info->set_machine_code_disasm(xe_strdup(string_buffer_.buffer())); + string_buffer_.Reset(); + } + + function->set_debug_info(std::move(debug_info)); + static_cast(function)->Setup( + reinterpret_cast(machine_code), code_size); + + // Install into indirection table. + const uint64_t host_address = reinterpret_cast(machine_code); + assert_true((host_address >> 32) == 0); + reinterpret_cast(backend_->code_cache()) + ->AddIndirection(function->address(), + static_cast(host_address)); + + return true; +} + +void A64Assembler::DumpMachineCode( + void* machine_code, size_t code_size, + const std::vector& source_map, StringBuffer* str) { + if (source_map.empty()) { + return; + } + auto source_map_index = 0; + uint32_t next_code_offset = source_map[0].code_offset; + + const uint8_t* code_ptr = reinterpret_cast(machine_code); + size_t remaining_code_size = code_size; + uint64_t address = uint64_t(machine_code); + cs_insn insn = {0}; + while (remaining_code_size && + cs_disasm_iter(capstone_handle_, &code_ptr, &remaining_code_size, + &address, &insn)) { + // Look up source offset. 
+ auto code_offset = + uint32_t(code_ptr - reinterpret_cast(machine_code)); + if (code_offset >= next_code_offset && + source_map_index < source_map.size()) { + auto& source_map_entry = source_map[source_map_index]; + str->AppendFormat("{:08X} ", source_map_entry.guest_address); + ++source_map_index; + next_code_offset = source_map_index < source_map.size() + ? source_map[source_map_index].code_offset + : UINT_MAX; + } else { + str->Append(" "); + } + + str->AppendFormat("{:08X} {:<6} {}\n", uint32_t(insn.address), + insn.mnemonic, insn.op_str); + } +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_assembler.h b/src/xenia/cpu/backend/a64/a64_assembler.h new file mode 100644 index 000000000..95e0a6f1e --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_assembler.h @@ -0,0 +1,59 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ +#define XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ + +#include +#include + +#include "xenia/base/string_buffer.h" +#include "xenia/cpu/backend/assembler.h" +#include "xenia/cpu/function.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Backend; +class A64Emitter; + +class A64Assembler : public Assembler { + public: + explicit A64Assembler(A64Backend* backend); + ~A64Assembler() override; + + bool Initialize() override; + + void Reset() override; + + bool Assemble(GuestFunction* function, hir::HIRBuilder* builder, + uint32_t debug_info_flags, + std::unique_ptr debug_info) override; + + private: + void DumpMachineCode(void* machine_code, size_t code_size, + const std::vector& source_map, + StringBuffer* str); + + private: + A64Backend* a64_backend_; + std::unique_ptr emitter_; + uintptr_t capstone_handle_; + + StringBuffer string_buffer_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ diff --git a/src/xenia/cpu/backend/a64/a64_backend.cc b/src/xenia/cpu/backend/a64/a64_backend.cc new file mode 100644 index 000000000..8b3f3a6f7 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_backend.cc @@ -0,0 +1,735 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_backend.h" + +#include + +#include "third_party/capstone/include/capstone/arm64.h" +#include "third_party/capstone/include/capstone/capstone.h" + +#include "xenia/base/exception_handler.h" +#include "xenia/base/logging.h" +#include "xenia/cpu/backend/a64/a64_assembler.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_stack_layout.h" +#include "xenia/cpu/breakpoint.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/stack_walker.h" + +DEFINE_int32(a64_extension_mask, -1, + "Allow the detection and utilization of specific instruction set " + "features.\n" + " 0 = armv8.0\n" + " 1 = LSE\n" + " 2 = F16C\n" + " -1 = Detect and utilize all possible processor features\n", + "a64"); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace oaknut::util; + +class A64ThunkEmitter : public A64Emitter { + public: + A64ThunkEmitter(A64Backend* backend); + ~A64ThunkEmitter() override; + HostToGuestThunk EmitHostToGuestThunk(); + GuestToHostThunk EmitGuestToHostThunk(); + ResolveFunctionThunk EmitResolveFunctionThunk(); + + private: + // The following four functions provide save/load functionality for registers. + // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been + // allocated on the stack. + + // Caller saved: + // Dont assume these registers will survive a subroutine call + // x0, v0 is not saved for use as arg0/return + // x1-x15, x30 | v0-v7 and v16-v31 + void EmitSaveVolatileRegs(); + void EmitLoadVolatileRegs(); + + // Callee saved: + // Subroutines must preserve these registers if they intend to use them + // x19-x30 | d8-d15 + void EmitSaveNonvolatileRegs(); + void EmitLoadNonvolatileRegs(); +}; + +A64Backend::A64Backend() : Backend(), code_cache_(nullptr) { + if (cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) != + CS_ERR_OK) { + assert_always("Failed to initialize capstone"); + } + cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); + cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON); + cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF); +} + +A64Backend::~A64Backend() { + if (capstone_handle_) { + cs_close(&capstone_handle_); + } + + A64Emitter::FreeConstData(emitter_data_); + ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this); +} + +bool A64Backend::Initialize(Processor* processor) { + if (!Backend::Initialize(processor)) { + return false; + } + + auto& gprs = machine_info_.register_sets[0]; + gprs.id = 0; + std::strcpy(gprs.name, "x"); + gprs.types = MachineInfo::RegisterSet::INT_TYPES; + gprs.count = A64Emitter::GPR_COUNT; + + auto& fprs = machine_info_.register_sets[1]; + fprs.id = 1; + std::strcpy(fprs.name, "v"); + fprs.types = MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES; + fprs.count = A64Emitter::FPR_COUNT; + + code_cache_ = A64CodeCache::Create(); + Backend::code_cache_ = code_cache_.get(); + if (!code_cache_->Initialize()) { + return false; + } + + // Generate thunks used to transition between jitted code and host code. 
+ A64ThunkEmitter thunk_emitter(this); + host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk(); + guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk(); + resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk(); + + // Set the code cache to use the ResolveFunction thunk for default + // indirections. + assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); + code_cache_->set_indirection_default( + uint32_t(uint64_t(resolve_function_thunk_))); + + // Allocate some special indirections. + code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF); + + // Allocate emitter constant data. + emitter_data_ = A64Emitter::PlaceConstData(); + + // Setup exception callback + ExceptionHandler::Install(&ExceptionCallbackThunk, this); + + return true; +} + +void A64Backend::CommitExecutableRange(uint32_t guest_low, + uint32_t guest_high) { + code_cache_->CommitExecutableRange(guest_low, guest_high); +} + +std::unique_ptr A64Backend::CreateAssembler() { + return std::make_unique(this); +} + +std::unique_ptr A64Backend::CreateGuestFunction( + Module* module, uint32_t address) { + return std::make_unique(module, address); +} + +uint64_t ReadCapstoneReg(HostThreadContext* context, arm64_reg reg) { + switch (reg) { + case ARM64_REG_X0: + return context->x[0]; + case ARM64_REG_X1: + return context->x[1]; + case ARM64_REG_X2: + return context->x[2]; + case ARM64_REG_X3: + return context->x[3]; + case ARM64_REG_X4: + return context->x[4]; + case ARM64_REG_X5: + return context->x[5]; + case ARM64_REG_X6: + return context->x[6]; + case ARM64_REG_X7: + return context->x[7]; + case ARM64_REG_X8: + return context->x[8]; + case ARM64_REG_X9: + return context->x[9]; + case ARM64_REG_X10: + return context->x[10]; + case ARM64_REG_X11: + return context->x[11]; + case ARM64_REG_X12: + return context->x[12]; + case ARM64_REG_X13: + return context->x[13]; + case ARM64_REG_X14: + return context->x[14]; + case ARM64_REG_X15: + return context->x[15]; + case ARM64_REG_X16: + return context->x[16]; + case ARM64_REG_X17: + return context->x[17]; + case ARM64_REG_X18: + return context->x[18]; + case ARM64_REG_X19: + return context->x[19]; + case ARM64_REG_X20: + return context->x[20]; + case ARM64_REG_X21: + return context->x[21]; + case ARM64_REG_X22: + return context->x[22]; + case ARM64_REG_X23: + return context->x[23]; + case ARM64_REG_X24: + return context->x[24]; + case ARM64_REG_X25: + return context->x[25]; + case ARM64_REG_X26: + return context->x[26]; + case ARM64_REG_X27: + return context->x[27]; + case ARM64_REG_X28: + return context->x[28]; + case ARM64_REG_X29: + return context->x[29]; + case ARM64_REG_X30: + return context->x[30]; + case ARM64_REG_W0: + return uint32_t(context->x[0]); + case ARM64_REG_W1: + return uint32_t(context->x[1]); + case ARM64_REG_W2: + return uint32_t(context->x[2]); + case ARM64_REG_W3: + return uint32_t(context->x[3]); + case ARM64_REG_W4: + return uint32_t(context->x[4]); + case ARM64_REG_W5: + return uint32_t(context->x[5]); + case ARM64_REG_W6: + return uint32_t(context->x[6]); + case ARM64_REG_W7: + return uint32_t(context->x[7]); + case ARM64_REG_W8: + return uint32_t(context->x[8]); + case ARM64_REG_W9: + return uint32_t(context->x[9]); + case ARM64_REG_W10: + return uint32_t(context->x[10]); + case ARM64_REG_W11: + return uint32_t(context->x[11]); + case ARM64_REG_W12: + return uint32_t(context->x[12]); + case ARM64_REG_W13: + return uint32_t(context->x[13]); + case ARM64_REG_W14: + return uint32_t(context->x[14]); + case ARM64_REG_W15: + 
return uint32_t(context->x[15]); + case ARM64_REG_W16: + return uint32_t(context->x[16]); + case ARM64_REG_W17: + return uint32_t(context->x[17]); + case ARM64_REG_W18: + return uint32_t(context->x[18]); + case ARM64_REG_W19: + return uint32_t(context->x[19]); + case ARM64_REG_W20: + return uint32_t(context->x[20]); + case ARM64_REG_W21: + return uint32_t(context->x[21]); + case ARM64_REG_W22: + return uint32_t(context->x[22]); + case ARM64_REG_W23: + return uint32_t(context->x[23]); + case ARM64_REG_W24: + return uint32_t(context->x[24]); + case ARM64_REG_W25: + return uint32_t(context->x[25]); + case ARM64_REG_W26: + return uint32_t(context->x[26]); + case ARM64_REG_W27: + return uint32_t(context->x[27]); + case ARM64_REG_W28: + return uint32_t(context->x[28]); + case ARM64_REG_W29: + return uint32_t(context->x[29]); + case ARM64_REG_W30: + return uint32_t(context->x[30]); + default: + assert_unhandled_case(reg); + return 0; + } +} + +bool TestCapstonePstate(arm64_cc cond, uint32_t pstate) { + // https://devblogs.microsoft.com/oldnewthing/20220815-00/?p=106975 + // Upper 4 bits of pstate are NZCV + const bool N = !!(pstate & 0x80000000); + const bool Z = !!(pstate & 0x40000000); + const bool C = !!(pstate & 0x20000000); + const bool V = !!(pstate & 0x10000000); + switch (cond) { + case ARM64_CC_EQ: + return (Z == true); + case ARM64_CC_NE: + return (Z == false); + case ARM64_CC_HS: + return (C == true); + case ARM64_CC_LO: + return (C == false); + case ARM64_CC_MI: + return (N == true); + case ARM64_CC_PL: + return (N == false); + case ARM64_CC_VS: + return (V == true); + case ARM64_CC_VC: + return (V == false); + case ARM64_CC_HI: + return ((C == true) && (Z == false)); + case ARM64_CC_LS: + return ((C == false) || (Z == true)); + case ARM64_CC_GE: + return (N == V); + case ARM64_CC_LT: + return (N != V); + case ARM64_CC_GT: + return ((Z == false) && (N == V)); + case ARM64_CC_LE: + return ((Z == true) || (N != V)); + case ARM64_CC_AL: + return true; + case ARM64_CC_NV: + return false; + default: + assert_unhandled_case(cond); + return false; + } +} + +uint64_t A64Backend::CalculateNextHostInstruction(ThreadDebugInfo* thread_info, + uint64_t current_pc) { + auto machine_code_ptr = reinterpret_cast(current_pc); + size_t remaining_machine_code_size = 64; + uint64_t host_address = current_pc; + cs_insn insn = {0}; + cs_detail all_detail = {0}; + insn.detail = &all_detail; + cs_disasm_iter(capstone_handle_, &machine_code_ptr, + &remaining_machine_code_size, &host_address, &insn); + const auto& detail = all_detail.arm64; + switch (insn.id) { + case ARM64_INS_B: + case ARM64_INS_BL: { + assert_true(detail.operands[0].type == ARM64_OP_IMM); + const int64_t pc_offset = static_cast(detail.operands[0].imm); + const bool test_passed = + TestCapstonePstate(detail.cc, thread_info->host_context.cpsr); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + case ARM64_INS_BR: + case ARM64_INS_BLR: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + const uint64_t target_pc = + ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg); + return target_pc; + } break; + case ARM64_INS_RET: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + const uint64_t target_pc = + ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg); + return target_pc; + } break; + case ARM64_INS_CBNZ: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + assert_true(detail.operands[1].type == ARM64_OP_IMM); + const int64_t 
pc_offset = static_cast(detail.operands[1].imm); + const bool test_passed = (0 != ReadCapstoneReg(&thread_info->host_context, + detail.operands[0].reg)); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + case ARM64_INS_CBZ: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + assert_true(detail.operands[1].type == ARM64_OP_IMM); + const int64_t pc_offset = static_cast(detail.operands[1].imm); + const bool test_passed = (0 == ReadCapstoneReg(&thread_info->host_context, + detail.operands[0].reg)); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + default: { + // Not a branching instruction - just move over it. + return current_pc + insn.size; + } break; + } +} + +void A64Backend::InstallBreakpoint(Breakpoint* breakpoint) { + breakpoint->ForEachHostAddress([breakpoint](uint64_t host_address) { + auto ptr = reinterpret_cast(host_address); + auto original_bytes = xe::load_and_swap(ptr); + assert_true(original_bytes != 0x0000'dead); + xe::store_and_swap(ptr, 0x0000'dead); + breakpoint->backend_data().emplace_back(host_address, original_bytes); + }); +} + +void A64Backend::InstallBreakpoint(Breakpoint* breakpoint, Function* fn) { + assert_true(breakpoint->address_type() == Breakpoint::AddressType::kGuest); + assert_true(fn->is_guest()); + auto guest_function = reinterpret_cast(fn); + auto host_address = + guest_function->MapGuestAddressToMachineCode(breakpoint->guest_address()); + if (!host_address) { + assert_always(); + return; + } + + // Assume we haven't already installed a breakpoint in this spot. + auto ptr = reinterpret_cast(host_address); + auto original_bytes = xe::load_and_swap(ptr); + assert_true(original_bytes != 0x0000'dead); + xe::store_and_swap(ptr, 0x0000'dead); + breakpoint->backend_data().emplace_back(host_address, original_bytes); +} + +void A64Backend::UninstallBreakpoint(Breakpoint* breakpoint) { + for (auto& pair : breakpoint->backend_data()) { + auto ptr = reinterpret_cast(pair.first); + auto instruction_bytes = xe::load_and_swap(ptr); + assert_true(instruction_bytes == 0x0000'dead); + xe::store_and_swap(ptr, static_cast(pair.second)); + } + breakpoint->backend_data().clear(); +} + +bool A64Backend::ExceptionCallbackThunk(Exception* ex, void* data) { + auto backend = reinterpret_cast(data); + return backend->ExceptionCallback(ex); +} + +bool A64Backend::ExceptionCallback(Exception* ex) { + if (ex->code() != Exception::Code::kIllegalInstruction) { + // We only care about illegal instructions. Other things will be handled by + // other handlers (probably). If nothing else picks it up we'll be called + // with OnUnhandledException to do real crash handling. + return false; + } + + // Verify an expected illegal instruction. + auto instruction_bytes = + xe::load_and_swap(reinterpret_cast(ex->pc())); + if (instruction_bytes != 0x0000'dead) { + // Not our `udf #0xdead` - not us. + return false; + } + + // Let the processor handle things. 
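+  // The 0x0000dead pattern is only ever written by InstallBreakpoint above,
+  // so this is one of our breakpoints; hand it to the debugger.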
+ return processor()->OnThreadBreakpointHit(ex); +} + +A64ThunkEmitter::A64ThunkEmitter(A64Backend* backend) : A64Emitter(backend) {} + +A64ThunkEmitter::~A64ThunkEmitter() {} + +HostToGuestThunk A64ThunkEmitter::EmitHostToGuestThunk() { + // X0 = target + // X1 = arg0 (context) + // X2 = arg1 (guest return address) + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveNonvolatileRegs(); + + MOV(X16, X0); + MOV(GetContextReg(), X1); // context + MOV(X0, X2); // return address + BLR(X16); + + EmitLoadNonvolatileRegs(); + + code_offsets.epilog = offset(); + + ADD(SP, SP, stack_size); + + RET(); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (HostToGuestThunk)fn; +} + +GuestToHostThunk A64ThunkEmitter::EmitGuestToHostThunk() { + // X0 = target function + // X1 = arg0 + // X2 = arg1 + // X3 = arg2 + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveVolatileRegs(); + + MOV(X16, X0); // function + MOV(X0, GetContextReg()); // context + BLR(X16); + + EmitLoadVolatileRegs(); + + code_offsets.epilog = offset(); + + ADD(SP, SP, stack_size); + RET(); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (GuestToHostThunk)fn; +} + +// A64Emitter handles actually resolving functions. 
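+// Takes the guest context and the target guest address, resolves (assembling
+// the function first if needed) and returns the host code address that the
+// thunk below branches to.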
+uint64_t ResolveFunction(void* raw_context, uint64_t target_address); + +ResolveFunctionThunk A64ThunkEmitter::EmitResolveFunctionThunk() { + // Entry: + // W17 = target PPC address + // X0 = context + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + // Preserve context register + STP(ZR, X0, SP, PRE_INDEXED, -16); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveVolatileRegs(); + + // mov(rcx, rsi); // context + // mov(rdx, rbx); + // mov(rax, reinterpret_cast(&ResolveFunction)); + // call(rax) + MOV(X0, GetContextReg()); // context + MOV(W1, W17); + MOV(X16, reinterpret_cast(&ResolveFunction)); + BLR(X16); + MOV(X16, X0); + + EmitLoadVolatileRegs(); + + code_offsets.epilog = offset(); + + // add(rsp, stack_size); + // jmp(rax); + ADD(SP, SP, stack_size); + + // Reload context register + LDP(ZR, X0, SP, POST_INDEXED, 16); + BR(X16); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (ResolveFunctionThunk)fn; +} + +void A64ThunkEmitter::EmitSaveVolatileRegs() { + // Save off volatile registers. 
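+  // X0 and Q0 are deliberately not saved here; they carry arg0 and the return
+  // value across the host call (see the caller-saved note on the class above).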
+ // Preserve arguments passed to and returned from a subroutine + // STR(X0, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X1, X2, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X3, X4, SP, offsetof(StackLayout::Thunk, r[2])); + STP(X5, X6, SP, offsetof(StackLayout::Thunk, r[4])); + STP(X7, X8, SP, offsetof(StackLayout::Thunk, r[6])); + STP(X9, X10, SP, offsetof(StackLayout::Thunk, r[8])); + STP(X11, X12, SP, offsetof(StackLayout::Thunk, r[10])); + STP(X13, X14, SP, offsetof(StackLayout::Thunk, r[12])); + STP(X15, X30, SP, offsetof(StackLayout::Thunk, r[14])); + + // Preserve arguments passed to and returned from a subroutine + // STR(Q0, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(Q1, Q2, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2])); + STP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4])); + STP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6])); + STP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8])); + STP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10])); + STP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12])); + STP(Q23, Q24, SP, offsetof(StackLayout::Thunk, xmm[14])); + STP(Q25, Q26, SP, offsetof(StackLayout::Thunk, xmm[16])); + STP(Q27, Q28, SP, offsetof(StackLayout::Thunk, xmm[18])); + STP(Q29, Q30, SP, offsetof(StackLayout::Thunk, xmm[20])); + STR(Q31, SP, offsetof(StackLayout::Thunk, xmm[21])); +} + +void A64ThunkEmitter::EmitLoadVolatileRegs() { + // Preserve arguments passed to and returned from a subroutine + // LDR(X0, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X1, X2, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X3, X4, SP, offsetof(StackLayout::Thunk, r[2])); + LDP(X5, X6, SP, offsetof(StackLayout::Thunk, r[4])); + LDP(X7, X8, SP, offsetof(StackLayout::Thunk, r[6])); + LDP(X9, X10, SP, offsetof(StackLayout::Thunk, r[8])); + LDP(X11, X12, SP, offsetof(StackLayout::Thunk, r[10])); + LDP(X13, X14, SP, offsetof(StackLayout::Thunk, r[12])); + LDP(X15, X30, SP, offsetof(StackLayout::Thunk, r[14])); + + // Preserve arguments passed to and returned from a subroutine + // LDR(Q0, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(Q1, Q2, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2])); + LDP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4])); + LDP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6])); + LDP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8])); + LDP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10])); + LDP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12])); + LDP(Q23, Q24, SP, offsetof(StackLayout::Thunk, xmm[14])); + LDP(Q25, Q26, SP, offsetof(StackLayout::Thunk, xmm[16])); + LDP(Q27, Q28, SP, offsetof(StackLayout::Thunk, xmm[18])); + LDP(Q29, Q30, SP, offsetof(StackLayout::Thunk, xmm[20])); + LDR(Q31, SP, offsetof(StackLayout::Thunk, xmm[21])); +} + +void A64ThunkEmitter::EmitSaveNonvolatileRegs() { + STP(X19, X20, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X21, X22, SP, offsetof(StackLayout::Thunk, r[2])); + STP(X23, X24, SP, offsetof(StackLayout::Thunk, r[4])); + STP(X25, X26, SP, offsetof(StackLayout::Thunk, r[6])); + STP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8])); + STP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10])); + + STR(X17, SP, offsetof(StackLayout::Thunk, r[12])); + + STP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1])); + STP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2])); + STP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3])); +} + +void 
A64ThunkEmitter::EmitLoadNonvolatileRegs() { + LDP(X19, X20, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X21, X22, SP, offsetof(StackLayout::Thunk, r[2])); + LDP(X23, X24, SP, offsetof(StackLayout::Thunk, r[4])); + LDP(X25, X26, SP, offsetof(StackLayout::Thunk, r[6])); + LDP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8])); + LDP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10])); + + LDR(X17, SP, offsetof(StackLayout::Thunk, r[12])); + + LDP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1])); + LDP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2])); + LDP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3])); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_backend.h b/src/xenia/cpu/backend/a64/a64_backend.h new file mode 100644 index 000000000..57557414c --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_backend.h @@ -0,0 +1,88 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ +#define XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ + +#include + +#include "xenia/base/cvar.h" +#include "xenia/cpu/backend/backend.h" + +DECLARE_int32(a64_extension_mask); + +namespace xe { +class Exception; +} // namespace xe +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64CodeCache; + +typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); +typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); +typedef void (*ResolveFunctionThunk)(); + +class A64Backend : public Backend { + public: + static const uint32_t kForceReturnAddress = 0x9FFF0000u; + + explicit A64Backend(); + ~A64Backend() override; + + A64CodeCache* code_cache() const { return code_cache_.get(); } + uintptr_t emitter_data() const { return emitter_data_; } + + // Call a generated function, saving all stack parameters. + HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; } + // Function that guest code can call to transition into host code. + GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; } + // Function that thunks to the ResolveFunction in A64Emitter. 
+ ResolveFunctionThunk resolve_function_thunk() const { + return resolve_function_thunk_; + } + + bool Initialize(Processor* processor) override; + + void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override; + + std::unique_ptr CreateAssembler() override; + + std::unique_ptr CreateGuestFunction(Module* module, + uint32_t address) override; + + uint64_t CalculateNextHostInstruction(ThreadDebugInfo* thread_info, + uint64_t current_pc) override; + + void InstallBreakpoint(Breakpoint* breakpoint) override; + void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override; + void UninstallBreakpoint(Breakpoint* breakpoint) override; + + private: + static bool ExceptionCallbackThunk(Exception* ex, void* data); + bool ExceptionCallback(Exception* ex); + + uintptr_t capstone_handle_ = 0; + + std::unique_ptr code_cache_; + uintptr_t emitter_data_ = 0; + + HostToGuestThunk host_to_guest_thunk_; + GuestToHostThunk guest_to_host_thunk_; + ResolveFunctionThunk resolve_function_thunk_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ diff --git a/src/xenia/cpu/backend/a64/a64_code_cache.cc b/src/xenia/cpu/backend/a64/a64_code_cache.cc new file mode 100644 index 000000000..f484967ac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache.cc @@ -0,0 +1,342 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_code_cache.h" + +#include +#include + +#include "third_party/fmt/include/fmt/format.h" +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/literals.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/module.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace xe::literals; + +A64CodeCache::A64CodeCache() = default; + +A64CodeCache::~A64CodeCache() { + if (indirection_table_base_) { + xe::memory::DeallocFixed(indirection_table_base_, 0, + xe::memory::DeallocationType::kRelease); + } + + // Unmap all views and close mapping. 
+ if (mapping_ != xe::memory::kFileMappingHandleInvalid) { + if (generated_code_write_base_ && + generated_code_write_base_ != generated_code_execute_base_) { + xe::memory::UnmapFileView(mapping_, generated_code_write_base_, + kGeneratedCodeSize); + } + if (generated_code_execute_base_) { + xe::memory::UnmapFileView(mapping_, generated_code_execute_base_, + kGeneratedCodeSize); + } + xe::memory::CloseFileMappingHandle(mapping_, file_name_); + mapping_ = xe::memory::kFileMappingHandleInvalid; + } +} + +bool A64CodeCache::Initialize() { + indirection_table_base_ = reinterpret_cast(xe::memory::AllocFixed( + reinterpret_cast(kIndirectionTableBase), kIndirectionTableSize, + xe::memory::AllocationType::kReserve, + xe::memory::PageAccess::kReadWrite)); + if (!indirection_table_base_) { + XELOGE("Unable to allocate code cache indirection table"); + XELOGE( + "This is likely because the {:X}-{:X} range is in use by some other " + "system DLL", + static_cast(kIndirectionTableBase), + kIndirectionTableBase + kIndirectionTableSize); + } + + // Create mmap file. This allows us to share the code cache with the debugger. + file_name_ = fmt::format("xenia_code_cache_{}", Clock::QueryHostTickCount()); + mapping_ = xe::memory::CreateFileMappingHandle( + file_name_, kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadWrite, + false); + if (mapping_ == xe::memory::kFileMappingHandleInvalid) { + XELOGE("Unable to create code cache mmap"); + return false; + } + + // Map generated code region into the file. Pages are committed as required. + if (xe::memory::IsWritableExecutableMemoryPreferred()) { + generated_code_execute_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeExecuteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadWrite, 0)); + generated_code_write_base_ = generated_code_execute_base_; + if (!generated_code_execute_base_ || !generated_code_write_base_) { + XELOGE("Unable to allocate code cache generated code storage"); + XELOGE( + "This is likely because the {:X}-{:X} range is in use by some other " + "system DLL", + uint64_t(kGeneratedCodeExecuteBase), + uint64_t(kGeneratedCodeExecuteBase + kGeneratedCodeSize)); + return false; + } + } else { + generated_code_execute_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeExecuteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadOnly, 0)); + generated_code_write_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeWriteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kReadWrite, 0)); + if (!generated_code_execute_base_ || !generated_code_write_base_) { + XELOGE("Unable to allocate code cache generated code storage"); + XELOGE( + "This is likely because the {:X}-{:X} and {:X}-{:X} ranges are in " + "use by some other system DLL", + uint64_t(kGeneratedCodeExecuteBase), + uint64_t(kGeneratedCodeExecuteBase + kGeneratedCodeSize), + uint64_t(kGeneratedCodeWriteBase), + uint64_t(kGeneratedCodeWriteBase + kGeneratedCodeSize)); + return false; + } + } + + // Preallocate the function map to a large, reasonable size. 
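+  // At 16 bytes per entry this reserves roughly 1.5 MiB, which is cheap next
+  // to the 256 MiB generated-code region itself.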
+ generated_code_map_.reserve(kMaximumFunctionCount); + + return true; +} + +void A64CodeCache::set_indirection_default(uint32_t default_value) { + indirection_default_value_ = default_value; +} + +void A64CodeCache::AddIndirection(uint32_t guest_address, + uint32_t host_address) { + if (!indirection_table_base_) { + return; + } + + uint32_t* indirection_slot = reinterpret_cast( + indirection_table_base_ + (guest_address - kIndirectionTableBase)); + *indirection_slot = host_address; +} + +void A64CodeCache::CommitExecutableRange(uint32_t guest_low, + uint32_t guest_high) { + if (!indirection_table_base_) { + return; + } + + // Commit the memory. + xe::memory::AllocFixed( + indirection_table_base_ + (guest_low - kIndirectionTableBase), + guest_high - guest_low, xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + + // Fill memory with the default value. + uint32_t* p = reinterpret_cast(indirection_table_base_); + for (uint32_t address = guest_low; address < guest_high; ++address) { + p[(address - kIndirectionTableBase) / 4] = indirection_default_value_; + } +} + +void A64CodeCache::PlaceHostCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void*& code_execute_address_out, + void*& code_write_address_out) { + // Same for now. We may use different pools or whatnot later on, like when + // we only want to place guest code in a serialized cache on disk. + PlaceGuestCode(guest_address, machine_code, func_info, nullptr, + code_execute_address_out, code_write_address_out); +} + +void A64CodeCache::PlaceGuestCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + GuestFunction* function_info, + void*& code_execute_address_out, + void*& code_write_address_out) { + // Hold a lock while we bump the pointers up. This is important as the + // unwind table requires entries AND code to be sorted in order. + size_t low_mark; + size_t high_mark; + uint8_t* code_execute_address; + UnwindReservation unwind_reservation; + { + auto global_lock = global_critical_region_.Acquire(); + + low_mark = generated_code_offset_; + + // Reserve code. + // Always move the code to land on 16b alignment. + code_execute_address = + generated_code_execute_base_ + generated_code_offset_; + code_execute_address_out = code_execute_address; + uint8_t* code_write_address = + generated_code_write_base_ + generated_code_offset_; + code_write_address_out = code_write_address; + generated_code_offset_ += xe::round_up(func_info.code_size.total, 16); + + auto tail_write_address = + generated_code_write_base_ + generated_code_offset_; + + // Reserve unwind info. + // We go on the high size of the unwind info as we don't know how big we + // need it, and a few extra bytes of padding isn't the worst thing. + unwind_reservation = RequestUnwindReservation(generated_code_write_base_ + + generated_code_offset_); + generated_code_offset_ += xe::round_up(unwind_reservation.data_size, 16); + + auto end_write_address = + generated_code_write_base_ + generated_code_offset_; + + high_mark = generated_code_offset_; + + // Store in map. It is maintained in sorted order of host PC dependent on + // us also being append-only. 
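+    // Key layout: [32-bit start offset << 32 | 32-bit end offset], both
+    // relative to the execute base, which is what LookupFunction() bsearches
+    // against using the offset of an arbitrary host PC inside the function.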
+ generated_code_map_.emplace_back( + (uint64_t(code_execute_address - generated_code_execute_base_) << 32) | + generated_code_offset_, + function_info); + + // TODO(DrChat): The following code doesn't really need to be under the + // global lock except for PlaceCode (but it depends on the previous code + // already being ran) + + // If we are going above the high water mark of committed memory, commit + // some more. It's ok if multiple threads do this, as redundant commits + // aren't harmful. + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16_MiB; + if (generated_code_execute_base_ == generated_code_write_base_) { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadWrite); + } else { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadOnly); + xe::memory::AllocFixed(generated_code_write_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + } + } while (generated_code_commit_mark_.compare_exchange_weak( + old_commit_mark, new_commit_mark)); + + // Copy code. + std::memcpy(code_write_address, machine_code, func_info.code_size.total); + + // Fill unused slots with 0x00 + std::memset(tail_write_address, 0x00, + static_cast(end_write_address - tail_write_address)); + + // Notify subclasses of placed code. + PlaceCode(guest_address, machine_code, func_info, code_execute_address, + unwind_reservation); + } + + // Now that everything is ready, fix up the indirection table. + // Note that we do support code that doesn't have an indirection fixup, so + // ignore those when we see them. + if (guest_address && indirection_table_base_) { + uint32_t* indirection_slot = reinterpret_cast( + indirection_table_base_ + (guest_address - kIndirectionTableBase)); + *indirection_slot = + uint32_t(reinterpret_cast(code_execute_address)); + } +} + +uint32_t A64CodeCache::PlaceData(const void* data, size_t length) { + // Hold a lock while we bump the pointers up. + size_t high_mark; + uint8_t* data_address = nullptr; + { + auto global_lock = global_critical_region_.Acquire(); + + // Reserve code. + // Always move the code to land on 16b alignment. + data_address = generated_code_write_base_ + generated_code_offset_; + generated_code_offset_ += xe::round_up(length, 16); + + high_mark = generated_code_offset_; + } + + // If we are going above the high water mark of committed memory, commit some + // more. It's ok if multiple threads do this, as redundant commits aren't + // harmful. 
+ size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16_MiB; + if (generated_code_execute_base_ == generated_code_write_base_) { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadWrite); + } else { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadOnly); + xe::memory::AllocFixed(generated_code_write_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + } + } while (generated_code_commit_mark_.compare_exchange_weak(old_commit_mark, + new_commit_mark)); + + // Copy code. + std::memcpy(data_address, data, length); + + return uint32_t(uintptr_t(data_address)); +} + +GuestFunction* A64CodeCache::LookupFunction(uint64_t host_pc) { + uint32_t key = uint32_t(host_pc - kGeneratedCodeExecuteBase); + void* fn_entry = std::bsearch( + &key, generated_code_map_.data(), generated_code_map_.size() + 1, + sizeof(std::pair), + [](const void* key_ptr, const void* element_ptr) { + auto key = *reinterpret_cast(key_ptr); + auto element = + reinterpret_cast*>( + element_ptr); + if (key < (element->first >> 32)) { + return -1; + } else if (key > uint32_t(element->first)) { + return 1; + } else { + return 0; + } + }); + if (fn_entry) { + return reinterpret_cast*>( + fn_entry) + ->second; + } else { + return nullptr; + } +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_code_cache.h b/src/xenia/cpu/backend/a64/a64_code_cache.h new file mode 100644 index 000000000..2bc9ed59f --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache.h @@ -0,0 +1,151 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ +#define XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "xenia/base/memory.h" +#include "xenia/base/mutex.h" +#include "xenia/cpu/backend/code_cache.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +struct EmitFunctionInfo { + struct _code_size { + size_t prolog; + size_t body; + size_t epilog; + size_t tail; + size_t total; + } code_size; + size_t prolog_stack_alloc_offset; // offset of instruction after stack alloc + size_t stack_size; +}; + +class A64CodeCache : public CodeCache { + public: + ~A64CodeCache() override; + + static std::unique_ptr Create(); + + virtual bool Initialize(); + + const std::filesystem::path& file_name() const override { return file_name_; } + uintptr_t execute_base_address() const override { + return kGeneratedCodeExecuteBase; + } + size_t total_size() const override { return kGeneratedCodeSize; } + + // TODO(benvanik): ELF serialization/etc + // TODO(benvanik): keep track of code blocks + // TODO(benvanik): padding/guards/etc + + bool has_indirection_table() { return indirection_table_base_ != nullptr; } + void set_indirection_default(uint32_t default_value); + void AddIndirection(uint32_t guest_address, uint32_t host_address); + + void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high); + + void PlaceHostCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void*& code_execute_address_out, + void*& code_write_address_out); + void PlaceGuestCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + GuestFunction* function_info, + void*& code_execute_address_out, + void*& code_write_address_out); + uint32_t PlaceData(const void* data, size_t length); + + GuestFunction* LookupFunction(uint64_t host_pc) override; + + protected: + // All executable code falls within 0x80000000 to 0x9FFFFFFF, so we can + // only map enough for lookups within that range. + static const size_t kIndirectionTableSize = 0x1FFFFFFF; + static const uintptr_t kIndirectionTableBase = 0x80000000; + // The code range is 512MB, but we know the total code games will have is + // pretty small (dozens of mb at most) and our expansion is reasonablish + // so 256MB should be more than enough. + static const size_t kGeneratedCodeSize = 0x0FFFFFFF; + static const uintptr_t kGeneratedCodeExecuteBase = 0xA0000000; + // Used for writing when PageAccess::kExecuteReadWrite is not supported. + static const uintptr_t kGeneratedCodeWriteBase = + kGeneratedCodeExecuteBase + kGeneratedCodeSize + 1; + + // This is picked to be high enough to cover whatever we can reasonably + // expect. If we hit issues with this it probably means some corner case + // in analysis triggering. 
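+  // kMaximumFunctionCount also sizes the RUNTIME_FUNCTION table that
+  // Win32A64CodeCache::Initialize() preallocates, and
+  // RequestUnwindReservation asserts against overflowing it, so raising this
+  // grows that allocation as well.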
+ static const size_t kMaximumFunctionCount = 100000; + + struct UnwindReservation { + size_t data_size = 0; + size_t table_slot = 0; + uint8_t* entry_address = 0; + }; + + A64CodeCache(); + + virtual UnwindReservation RequestUnwindReservation(uint8_t* entry_address) { + return UnwindReservation(); + } + virtual void PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void* code_execute_address, + UnwindReservation unwind_reservation) {} + + std::filesystem::path file_name_; + xe::memory::FileMappingHandle mapping_ = + xe::memory::kFileMappingHandleInvalid; + + // NOTE: the global critical region must be held when manipulating the offsets + // or counts of anything, to keep the tables consistent and ordered. + xe::global_critical_region global_critical_region_; + + // Value that the indirection table will be initialized with upon commit. + uint32_t indirection_default_value_ = 0xFEEDF00D; + + // Fixed at kIndirectionTableBase in host space, holding 4 byte pointers into + // the generated code table that correspond to the PPC functions in guest + // space. + uint8_t* indirection_table_base_ = nullptr; + // Fixed at kGeneratedCodeExecuteBase and holding all generated code, growing + // as needed. + uint8_t* generated_code_execute_base_ = nullptr; + // View of the memory that backs generated_code_execute_base_ when + // PageAccess::kExecuteReadWrite is not supported, for writing the generated + // code. Equals to generated_code_execute_base_ when it's supported. + uint8_t* generated_code_write_base_ = nullptr; + // Current offset to empty space in generated code. + size_t generated_code_offset_ = 0; + // Current high water mark of COMMITTED code. + std::atomic generated_code_commit_mark_ = {0}; + // Sorted map by host PC base offsets to source function info. + // This can be used to bsearch on host PC to find the guest function. + // The key is [start address | end address]. + std::vector> generated_code_map_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ diff --git a/src/xenia/cpu/backend/a64/a64_code_cache_win.cc b/src/xenia/cpu/backend/a64/a64_code_cache_win.cc new file mode 100644 index 000000000..21a87e9f2 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache_win.cc @@ -0,0 +1,319 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_code_cache.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/platform_win.h" +#include "xenia/cpu/function.h" + +// Function pointer definitions +using FnRtlAddGrowableFunctionTable = decltype(&RtlAddGrowableFunctionTable); +using FnRtlGrowFunctionTable = decltype(&RtlGrowFunctionTable); +using FnRtlDeleteGrowableFunctionTable = + decltype(&RtlDeleteGrowableFunctionTable); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// ARM64 unwind-op codes +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +// https://www.corsix.org/content/windows-arm64-unwind-codes +typedef enum _UNWIND_OP_CODES { + UWOP_NOP = 0xE3, + UWOP_ALLOC_S = 0x00, // sub sp, sp, i*16 + UWOP_ALLOC_L = 0xE0'00'00'00, // sub sp, sp, i*16 + UWOP_SAVE_FPLR = 0x40, // stp fp, lr, [sp+i*8] + UWOP_SAVE_FPLRX = 0x80, // stp fp, lr, [sp-(i+1)*8]! + UWOP_SET_FP = 0xE1, // mov fp, sp + UWOP_END = 0xE4, +} UNWIND_CODE_OPS; + +using UNWIND_CODE = uint32_t; + +static_assert(sizeof(UNWIND_CODE) == sizeof(uint32_t)); + +// UNWIND_INFO defines the static part (first 32-bit) of the .xdata record +typedef struct _UNWIND_INFO { + uint32_t FunctionLength : 18; + uint32_t Version : 2; + uint32_t X : 1; + uint32_t E : 1; + uint32_t EpilogCount : 5; + uint32_t CodeWords : 5; + UNWIND_CODE UnwindCodes[2]; +} UNWIND_INFO, *PUNWIND_INFO; + +static_assert(offsetof(UNWIND_INFO, UnwindCodes[0]) == 4); +static_assert(offsetof(UNWIND_INFO, UnwindCodes[1]) == 8); + +// Size of unwind info per function. +static const uint32_t kUnwindInfoSize = sizeof(UNWIND_INFO); + +class Win32A64CodeCache : public A64CodeCache { + public: + Win32A64CodeCache(); + ~Win32A64CodeCache() override; + + bool Initialize() override; + + void* LookupUnwindInfo(uint64_t host_pc) override; + + private: + UnwindReservation RequestUnwindReservation(uint8_t* entry_address) override; + void PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, void* code_execute_address, + UnwindReservation unwind_reservation) override; + + void InitializeUnwindEntry(uint8_t* unwind_entry_address, + size_t unwind_table_slot, + void* code_execute_address, + const EmitFunctionInfo& func_info); + + // Growable function table system handle. + void* unwind_table_handle_ = nullptr; + // Actual unwind table entries. + std::vector unwind_table_; + // Current number of entries in the table. + std::atomic unwind_table_count_ = {0}; + // Does this version of Windows support growable funciton tables? 
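+  // RtlAddGrowableFunctionTable / RtlGrowFunctionTable /
+  // RtlDeleteGrowableFunctionTable are resolved from ntdll at runtime in
+  // Initialize(); if any of them is unavailable we fall back to
+  // RtlInstallFunctionTableCallback instead.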
+ bool supports_growable_table_ = false; + + FnRtlAddGrowableFunctionTable add_growable_table_ = nullptr; + FnRtlDeleteGrowableFunctionTable delete_growable_table_ = nullptr; + FnRtlGrowFunctionTable grow_table_ = nullptr; +}; + +std::unique_ptr A64CodeCache::Create() { + return std::make_unique(); +} + +Win32A64CodeCache::Win32A64CodeCache() = default; + +Win32A64CodeCache::~Win32A64CodeCache() { + if (supports_growable_table_) { + if (unwind_table_handle_) { + delete_growable_table_(unwind_table_handle_); + } + } else { + if (generated_code_execute_base_) { + RtlDeleteFunctionTable(reinterpret_cast( + reinterpret_cast(generated_code_execute_base_) | 0x3)); + } + } +} + +bool Win32A64CodeCache::Initialize() { + if (!A64CodeCache::Initialize()) { + return false; + } + + // Compute total number of unwind entries we should allocate. + // We don't support reallocing right now, so this should be high. + unwind_table_.resize(kMaximumFunctionCount); + + // Check if this version of Windows supports growable function tables. + auto ntdll_handle = GetModuleHandleW(L"ntdll.dll"); + if (!ntdll_handle) { + add_growable_table_ = nullptr; + delete_growable_table_ = nullptr; + grow_table_ = nullptr; + } else { + add_growable_table_ = (FnRtlAddGrowableFunctionTable)GetProcAddress( + ntdll_handle, "RtlAddGrowableFunctionTable"); + delete_growable_table_ = (FnRtlDeleteGrowableFunctionTable)GetProcAddress( + ntdll_handle, "RtlDeleteGrowableFunctionTable"); + grow_table_ = (FnRtlGrowFunctionTable)GetProcAddress( + ntdll_handle, "RtlGrowFunctionTable"); + } + supports_growable_table_ = + add_growable_table_ && delete_growable_table_ && grow_table_; + + // Create table and register with the system. It's empty now, but we'll grow + // it as functions are added. + if (supports_growable_table_) { + if (add_growable_table_( + &unwind_table_handle_, unwind_table_.data(), unwind_table_count_, + DWORD(unwind_table_.size()), + reinterpret_cast(generated_code_execute_base_), + reinterpret_cast(generated_code_execute_base_ + + kGeneratedCodeSize))) { + XELOGE("Unable to create unwind function table"); + return false; + } + } else { + // Install a callback that the debugger will use to lookup unwind info on + // demand. + if (!RtlInstallFunctionTableCallback( + reinterpret_cast(generated_code_execute_base_) | 0x3, + reinterpret_cast(generated_code_execute_base_), + kGeneratedCodeSize, + [](DWORD64 control_pc, PVOID context) { + auto code_cache = reinterpret_cast(context); + return reinterpret_cast( + code_cache->LookupUnwindInfo(control_pc)); + }, + this, nullptr)) { + XELOGE("Unable to install function table callback"); + return false; + } + } + + return true; +} + +Win32A64CodeCache::UnwindReservation +Win32A64CodeCache::RequestUnwindReservation(uint8_t* entry_address) { + assert_false(unwind_table_count_ >= kMaximumFunctionCount); + UnwindReservation unwind_reservation; + unwind_reservation.data_size = xe::round_up(kUnwindInfoSize, 16); + unwind_reservation.table_slot = unwind_table_count_++; + unwind_reservation.entry_address = entry_address; + return unwind_reservation; +} + +void Win32A64CodeCache::PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void* code_execute_address, + UnwindReservation unwind_reservation) { + // Add unwind info. + InitializeUnwindEntry(unwind_reservation.entry_address, + unwind_reservation.table_slot, code_execute_address, + func_info); + + if (supports_growable_table_) { + // Notify that the unwind table has grown. 
+ // We do this outside of the lock, but with the latest total count. + grow_table_(unwind_table_handle_, unwind_table_count_); + } + + // https://docs.microsoft.com/en-us/uwp/win32-and-com/win32-apis + FlushInstructionCache(GetCurrentProcess(), code_execute_address, + func_info.code_size.total); +} + +constexpr UNWIND_CODE UnwindOpWord(uint8_t code0 = UWOP_NOP, + uint8_t code1 = UWOP_NOP, + uint8_t code2 = UWOP_NOP, + uint8_t code3 = UWOP_NOP) { + return static_cast(code0) | (static_cast(code1) << 8) | + (static_cast(code2) << 16) | + (static_cast(code3) << 24); +} + +// 8-byte unwind code for "stp fp, lr, [sp, #-16]! +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +static uint8_t OpSaveFpLrX(int16_t pre_index_offset) { + assert_true(pre_index_offset <= -8); + assert_true(pre_index_offset >= -512); + // 16-byte aligned + constexpr int IndexShift = 3; + constexpr int IndexMask = (1 << IndexShift) - 1; + assert_true((pre_index_offset & IndexMask) == 0); + const uint32_t encoded_value = (-pre_index_offset >> IndexShift) - 1; + return UWOP_SAVE_FPLRX | encoded_value; +} + +// Ensure a 16-byte aligned stack +static constexpr size_t StackAlignShift = 4; // n / 16 +static constexpr size_t StackAlignMask = (1 << StackAlignShift) - 1; // n % 16 + +// 8-byte unwind code for up to +512-byte "sub sp, sp, #stack_space" +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +static uint8_t OpAllocS(int16_t stack_space) { + assert_true(stack_space >= 0); + assert_true(stack_space < 512); + assert_true((stack_space & StackAlignMask) == 0); + return UWOP_ALLOC_S | (stack_space >> StackAlignShift); +} + +// 4-byte unwind code for +256MiB "sub sp, sp, #stack_space" +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +uint32_t OpAllocL(int32_t stack_space) { + assert_true(stack_space >= 0); + assert_true(stack_space < (0xFFFFFF * 16)); + assert_true((stack_space & StackAlignMask) == 0); + return xe::byte_swap(UWOP_ALLOC_L | + ((stack_space >> StackAlignShift) & 0xFF'FF'FF)); +} + +void Win32A64CodeCache::InitializeUnwindEntry( + uint8_t* unwind_entry_address, size_t unwind_table_slot, + void* code_execute_address, const EmitFunctionInfo& func_info) { + auto unwind_info = reinterpret_cast(unwind_entry_address); + + *unwind_info = {}; + // ARM64 instructions are always multiples of 4 bytes + // Windows ignores the bottom 2 bits + unwind_info->FunctionLength = func_info.code_size.total / 4; + unwind_info->CodeWords = 2; + + // https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling?view=msvc-170#unwind-codes + // The array of unwind codes is a pool of sequences that describe exactly how + // to undo the effects of the prolog. They're stored in the same order the + // operations need to be undone. The unwind codes can be thought of as a small + // instruction set, encoded as a string of bytes. When execution is complete, + // the return address to the calling function is in the lr register. And, all + // non-volatile registers are restored to their values at the time the + // function was called. + + // Function frames are generally: + // STP(X29, X30, SP, PRE_INDEXED, -16); + // MOV(X29, XSP); + // SUB(XSP, XSP, stack_size); + // ... function body ... 
+ // ADD(XSP, XSP, stack_size); + // MOV(XSP, X29); + // LDP(X29, X30, SP, POST_INDEXED, 16); + + // These opcodes must undo the epilog and put the return address within lr + unwind_info->UnwindCodes[0] = OpAllocL(func_info.stack_size); + unwind_info->UnwindCodes[1] = + UnwindOpWord(UWOP_SET_FP, OpSaveFpLrX(-16), UWOP_END); + + // Add entry. + RUNTIME_FUNCTION& fn_entry = unwind_table_[unwind_table_slot]; + fn_entry.BeginAddress = + DWORD(reinterpret_cast(code_execute_address) - + generated_code_execute_base_); + fn_entry.UnwindData = + DWORD(unwind_entry_address - generated_code_execute_base_); +} + +void* Win32A64CodeCache::LookupUnwindInfo(uint64_t host_pc) { + return std::bsearch( + &host_pc, unwind_table_.data(), unwind_table_count_, + sizeof(RUNTIME_FUNCTION), + [](const void* key_ptr, const void* element_ptr) { + auto key = *reinterpret_cast(key_ptr) - + kGeneratedCodeExecuteBase; + auto element = reinterpret_cast(element_ptr); + if (key < element->BeginAddress) { + return -1; + } else if (key > (element->BeginAddress + element->FunctionLength)) { + return 1; + } else { + return 0; + } + }); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_emitter.cc b/src/xenia/cpu/backend/a64/a64_emitter.cc new file mode 100644 index 000000000..6ae853ff3 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_emitter.cc @@ -0,0 +1,995 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_util.h" + +#include + +#include +#include + +#include "third_party/fmt/include/fmt/format.h" +#include "xenia/base/assert.h" +#include "xenia/base/atomic.h" +#include "xenia/base/debugging.h" +#include "xenia/base/literals.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/profiling.h" +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_stack_layout.h" +#include "xenia/cpu/cpu_flags.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/function_debug_info.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/symbol.h" +#include "xenia/cpu/thread_state.h" + +#include "oaknut/feature_detection/cpu_feature.hpp" +#include "oaknut/feature_detection/feature_detection.hpp" +#include "oaknut/feature_detection/feature_detection_idregs.hpp" + +DEFINE_bool(debugprint_trap_log, false, + "Log debugprint traps to the active debugger", "CPU"); +DEFINE_bool(ignore_undefined_externs, true, + "Don't exit when an undefined extern is called.", "CPU"); +DEFINE_bool(emit_source_annotations, false, + "Add extra movs and nops to make disassembly easier to read.", + "CPU"); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using xe::cpu::hir::HIRBuilder; +using xe::cpu::hir::Instr; +using namespace xe::literals; +using namespace oaknut::util; + +static const size_t kStashOffset = 32; +// 
static const size_t kStashOffsetHigh = 32 + 32; + +// Register indices that the HIR is allowed to use for operands +const uint8_t A64Emitter::gpr_reg_map_[A64Emitter::GPR_COUNT] = { + 19, 20, 21, 22, 23, 24, 25, 26, +}; + +const uint8_t A64Emitter::fpr_reg_map_[A64Emitter::FPR_COUNT] = { + 8, 9, 10, 11, 12, 13, 14, 15, +}; + +A64Emitter::A64Emitter(A64Backend* backend) + : VectorCodeGenerator(assembly_buffer), + processor_(backend->processor()), + backend_(backend), + code_cache_(backend->code_cache()) { + oaknut::CpuFeatures cpu_ = oaknut::detect_features(); + + // Combine with id register detection +#if OAKNUT_SUPPORTS_READING_ID_REGISTERS > 0 +#if OAKNUT_SUPPORTS_READING_ID_REGISTERS == 1 + const std::optional id_registers = + oaknut::read_id_registers(); +#elif OAKNUT_SUPPORTS_READING_ID_REGISTERS == 2 + const std::optional id_registers = + oaknut::read_id_registers(0); +#endif + if (id_registers.has_value()) { + cpu_ = cpu_ | oaknut::detect_features_via_id_registers(*id_registers); + } +#endif + +#define TEST_EMIT_FEATURE(emit, ext) \ + if ((cvars::a64_extension_mask & emit) == emit) { \ + feature_flags_ |= (cpu_.has(ext) ? emit : 0); \ + } + + TEST_EMIT_FEATURE(kA64EmitLSE, oaknut::CpuFeature::LSE); + TEST_EMIT_FEATURE(kA64EmitF16C, oaknut::CpuFeature::FP16Conv); + +#undef TEST_EMIT_FEATURE +} + +A64Emitter::~A64Emitter() = default; + +bool A64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, + uint32_t debug_info_flags, FunctionDebugInfo* debug_info, + void** out_code_address, size_t* out_code_size, + std::vector* out_source_map) { + SCOPE_profile_cpu_f("cpu"); + + // Reset. + debug_info_ = debug_info; + debug_info_flags_ = debug_info_flags; + trace_data_ = &function->trace_data(); + source_map_arena_.Reset(); + + // Fill the generator with code. + EmitFunctionInfo func_info = {}; + if (!Emit(builder, func_info)) { + return false; + } + + // Copy the final code to the cache and relocate it. + *out_code_size = offset(); + *out_code_address = Emplace(func_info, function); + + // Stash source map. + source_map_arena_.CloneContents(out_source_map); + + return true; +} + +void* A64Emitter::Emplace(const EmitFunctionInfo& func_info, + GuestFunction* function) { + // Copy the current oaknut instruction-buffer into the code-cache + void* new_execute_address; + void* new_write_address; + + assert_true(func_info.code_size.total == offset()); + + if (function) { + code_cache_->PlaceGuestCode(function->address(), assembly_buffer.data(), + func_info, function, new_execute_address, + new_write_address); + } else { + code_cache_->PlaceHostCode(0, assembly_buffer.data(), func_info, + new_execute_address, new_write_address); + } + + // Reset the oaknut instruction-buffer + assembly_buffer.clear(); + label_lookup_.clear(); + + return new_execute_address; +} + +bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { + oaknut::Label epilog_label; + epilog_label_ = &epilog_label; + + // Calculate stack size. We need to align things to their natural sizes. + // This could be much better (sort by type/etc). + auto locals = builder->locals(); + size_t stack_offset = StackLayout::GUEST_STACK_SIZE; + for (auto it = locals.begin(); it != locals.end(); ++it) { + auto slot = *it; + size_t type_size = GetTypeSize(slot->type); + + // Align to natural size. + stack_offset = xe::align(stack_offset, type_size); + slot->set_constant((uint32_t)stack_offset); + stack_offset += type_size; + } + + // Ensure 16b alignment. 
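+  // AArch64 SP-relative loads and stores can fault if SP is not kept 16-byte
+  // aligned, and the Windows ARM64 unwind codes emitted by the code cache
+  // encode stack adjustments in 16-byte units, so the final frame size must
+  // stay a multiple of 16.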
+ stack_offset -= StackLayout::GUEST_STACK_SIZE; + stack_offset = xe::align(stack_offset, static_cast(16)); + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + code_offsets.prolog = offset(); + + // Function prolog. + // Must be 16b aligned. + // Windows is very strict about the form of this and the epilog: + // https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2017 + // IMPORTANT: any changes to the prolog must be kept in sync with + // A64CodeCache, which dynamically generates exception information. + // Adding or changing anything here must be matched! + size_t stack_size = StackLayout::GUEST_STACK_SIZE + stack_offset; + + // The SUB instruction can only encode immediates withi 0xFFF or 0xFFF000 + // If the stack size is greater than 0xFFF, then just align it to 0x1000 + if (stack_size > 0xFFF) { + stack_size = xe::align(stack_size, static_cast(0x1000)); + } + + assert_true(stack_size % 16 == 0); + func_info.stack_size = stack_size; + stack_size_ = stack_size; + + STP(X29, X30, SP, PRE_INDEXED, -16); + MOV(X29, SP); + + SUB(SP, SP, (uint32_t)stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + STR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); + STR(X0, SP, StackLayout::GUEST_RET_ADDR); + STR(XZR, SP, StackLayout::GUEST_CALL_RET_ADDR); + + // Safe now to do some tracing. + if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) { + // We require 32-bit addresses. + assert_true(uint64_t(trace_data_->header()) < UINT_MAX); + auto trace_header = trace_data_->header(); + + // Call count. + MOV(W0, 1); + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_call_count))); + LDADDAL(X0, X0, X5); + + // Get call history slot. + static_assert(FunctionTraceData::kFunctionCallerHistoryCount == 4, + "bitmask depends on count"); + LDR(X0, X5); + AND(W0, W0, 0b00000011); + + // Record call history value into slot (guest addr in W1). + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_caller_history))); + STR(W1, X5, X0, oaknut::IndexExt::LSL, 2); + + // Calling thread. Load X0 with thread ID. + EmitGetCurrentThreadId(); + MOV(W5, 1); + LSL(W0, W5, W0); + + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_thread_use))); + LDSET(W0, WZR, X5); + } + + // Load membase. + LDR(GetMembaseReg(), GetContextReg(), + offsetof(ppc::PPCContext, virtual_membase)); + + // Body. + auto block = builder->first_block(); + while (block) { + // Mark block labels. + auto label = block->label_head; + while (label) { + l(label_lookup_[label->name]); + label = label->next; + } + + // Process instructions. + const Instr* instr = block->instr_head; + while (instr) { + const Instr* new_tail = instr; + if (!SelectSequence(this, instr, &new_tail)) { + // No sequence found! + // NOTE: If you encounter this after adding a new instruction, do a full + // rebuild! + assert_always(); + XELOGE("Unable to process HIR opcode {}", instr->opcode->name); + break; + } + instr = new_tail; + } + + block = block->next; + } + + // Function epilog. 
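+  // The epilog below must mirror the prolog above (as must the tail-call
+  // paths in Call/CallIndirect): the unwind codes emitted by
+  // Win32A64CodeCache::InitializeUnwindEntry describe exactly this frame
+  // shape, so the two have to stay in sync.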
+ l(epilog_label); + epilog_label_ = nullptr; + EmitTraceUserCallReturn(); + LDR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); + + code_offsets.epilog = offset(); + + ADD(SP, SP, (uint32_t)stack_size); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + RET(); + + code_offsets.tail = offset(); + + if (cvars::emit_source_annotations) { + NOP(); + NOP(); + NOP(); + NOP(); + NOP(); + } + + assert_zero(code_offsets.prolog); + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + + return true; +} + +void A64Emitter::MarkSourceOffset(const Instr* i) { + auto entry = source_map_arena_.Alloc(); + entry->guest_address = static_cast(i->src1.offset); + entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal; + entry->code_offset = static_cast(offset()); + + if (cvars::emit_source_annotations) { + NOP(); + NOP(); + MOV(X0, entry->guest_address); + NOP(); + NOP(); + } + + if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) { + const uint32_t instruction_index = + (entry->guest_address - trace_data_->start_address()) / 4; + MOV(X0, 1); + MOV(X1, reinterpret_cast( + low_address(trace_data_->instruction_execute_counts() + + instruction_index * 8))); + LDADDAL(X0, ZR, X1); + } +} + +void A64Emitter::EmitGetCurrentThreadId() { + // X27 must point to context. We could fetch from the stack if needed. + LDRH(W0, GetContextReg(), offsetof(ppc::PPCContext, thread_id)); +} + +void A64Emitter::EmitTraceUserCallReturn() {} + +void A64Emitter::DebugBreak() { BRK(0xF000); } + +uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { + auto thread_state = *reinterpret_cast(raw_context); + uint32_t str_ptr = uint32_t(thread_state->context()->r[3]); + // uint16_t str_len = uint16_t(thread_state->context()->r[4]); + auto str = thread_state->memory()->TranslateVirtual(str_ptr); + // TODO(benvanik): truncate to length? + XELOGD("(DebugPrint) {}", str); + + if (cvars::debugprint_trap_log) { + debugging::DebugPrint("(DebugPrint) {}", str); + } + + return 0; +} + +uint64_t TrapDebugBreak(void* raw_context, uint64_t address) { + auto thread_state = *reinterpret_cast(raw_context); + XELOGE("tw/td forced trap hit! This should be a crash!"); + if (cvars::break_on_debugbreak) { + xe::debugging::Break(); + } + return 0; +} + +void A64Emitter::Trap(uint16_t trap_type) { + switch (trap_type) { + case 20: + case 26: + // 0x0FE00014 is a 'debug print' where r3 = buffer r4 = length + CallNative(TrapDebugPrint, 0); + break; + case 0: + case 22: + // Always trap? + // TODO(benvanik): post software interrupt to debugger. + CallNative(TrapDebugBreak, 0); + break; + case 25: + // ? + break; + default: + XELOGW("Unknown trap type {}", trap_type); + BRK(0xF000); + break; + } +} + +void A64Emitter::UnimplementedInstr(const hir::Instr* i) { + // TODO(benvanik): notify debugger. + BRK(0xF000); + assert_always(); +} + +// This is used by the A64ThunkEmitter's ResolveFunctionThunk. +uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { + auto thread_state = *reinterpret_cast(raw_context); + + // TODO(benvanik): required? 
+ assert_not_zero(target_address); + + auto fn = thread_state->processor()->ResolveFunction( + static_cast(target_address)); + assert_not_null(fn); + auto a64_fn = static_cast(fn); + uint64_t addr = reinterpret_cast(a64_fn->machine_code()); + + return addr; +} + +void A64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { + assert_not_null(function); + auto fn = static_cast(function); + // Resolve address to the function to call and store in X16. + if (fn->machine_code()) { + // TODO(benvanik): is it worth it to do this? It removes the need for + // a ResolveFunction call, but makes the table less useful. + assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000); + MOV(X16, uint32_t(uint64_t(fn->machine_code()))); + } else if (code_cache_->has_indirection_table()) { + // Load the pointer to the indirection table maintained in A64CodeCache. + // The target dword will either contain the address of the generated code + // or a thunk to ResolveAddress. + MOV(W17, function->address()); + LDR(W16, X17); + } else { + // Old-style resolve. + // Not too important because indirection table is almost always available. + // TODO: Overwrite the call-site with a straight call. + CallNative(&ResolveFunction, function->address()); + MOV(X16, X0); + } + + // Actually jump/call to X16. + if (instr->flags & hir::CALL_TAIL) { + // Since we skip the prolog we need to mark the return here. + EmitTraceUserCallReturn(); + + // Pass the callers return address over. + LDR(X0, SP, StackLayout::GUEST_RET_ADDR); + + ADD(SP, SP, static_cast(stack_size())); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + BR(X16); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. + LDR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); + + BLR(X16); + } +} + +void A64Emitter::CallIndirect(const hir::Instr* instr, + const oaknut::XReg& reg) { + // Check if return. + if (instr->flags & hir::CALL_POSSIBLE_RETURN) { + LDR(W16, SP, StackLayout::GUEST_RET_ADDR); + CMP(reg.toW(), W16); + B(oaknut::Cond::EQ, epilog_label()); + } + + // Load the pointer to the indirection table maintained in A64CodeCache. + // The target dword will either contain the address of the generated code + // or a thunk to ResolveAddress. + if (code_cache_->has_indirection_table()) { + if (reg.toW().index() != W17.index()) { + MOV(W17, reg.toW()); + } + LDR(W16, X17); + } else { + // Old-style resolve. + // Not too important because indirection table is almost always available. + MOV(X0, GetContextReg()); + MOV(W1, reg.toW()); + + MOV(X16, reinterpret_cast(ResolveFunction)); + BLR(X16); + MOV(X16, X0); + } + + // Actually jump/call to X16. + if (instr->flags & hir::CALL_TAIL) { + // Since we skip the prolog we need to mark the return here. + EmitTraceUserCallReturn(); + + // Pass the callers return address over. + LDR(X0, SP, StackLayout::GUEST_RET_ADDR); + + ADD(SP, SP, static_cast(stack_size())); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + BR(X16); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. 
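+    // X0 carries the guest return address into the callee; the callee's
+    // prolog stashes it at StackLayout::GUEST_RET_ADDR (see the
+    // STR(X0, ...) in A64Emitter::Emit).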
+ LDR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); + + BLR(X16); + } +} + +uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) { + auto function = reinterpret_cast(function_ptr); + if (!cvars::ignore_undefined_externs) { + xe::FatalError(fmt::format("undefined extern call to {:08X} {}", + function->address(), function->name().c_str())); + } else { + XELOGE("undefined extern call to {:08X} {}", function->address(), + function->name()); + } + return 0; +} +void A64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { + bool undefined = true; + if (function->behavior() == Function::Behavior::kBuiltin) { + auto builtin_function = static_cast(function); + if (builtin_function->handler()) { + undefined = false; + // x0 = target function + // x1 = arg0 + // x2 = arg1 + // x3 = arg2 + MOV(X0, reinterpret_cast(builtin_function->handler())); + MOV(X1, reinterpret_cast(builtin_function->arg0())); + MOV(X2, reinterpret_cast(builtin_function->arg1())); + + auto thunk = backend()->guest_to_host_thunk(); + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // x0 = host return + } + } else if (function->behavior() == Function::Behavior::kExtern) { + auto extern_function = static_cast(function); + if (extern_function->extern_handler()) { + undefined = false; + // x0 = target function + // x1 = arg0 + // x2 = arg1 + // x3 = arg2 + MOV(X0, reinterpret_cast(extern_function->extern_handler())); + LDR(X1, GetContextReg(), offsetof(ppc::PPCContext, kernel_state)); + + auto thunk = backend()->guest_to_host_thunk(); + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // x0 = host return + } + } + if (undefined) { + CallNative(UndefinedCallExtern, reinterpret_cast(function)); + } +} + +void A64Emitter::CallNative(void* fn) { CallNativeSafe(fn); } + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context)) { + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) { + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), + uint64_t arg0) { + MOV(GetNativeParam(0), arg0); + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNativeSafe(void* fn) { + // X0 = target function + // X1 = arg0 + // X2 = arg1 + // X3 = arg2 + auto thunk = backend()->guest_to_host_thunk(); + + MOV(X0, reinterpret_cast(fn)); + + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // X0 = host return +} + +void A64Emitter::SetReturnAddress(uint64_t value) { + MOV(X0, value); + STR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); +} + +oaknut::XReg A64Emitter::GetNativeParam(uint32_t param) { + if (param == 0) + return X1; + else if (param == 1) + return X2; + else if (param == 2) + return X3; + + assert_always(); + return X3; +} + +// Important: If you change these, you must update the thunks in a64_backend.cc! +oaknut::XReg A64Emitter::GetContextReg() { return X27; } +oaknut::XReg A64Emitter::GetMembaseReg() { return X28; } + +void A64Emitter::ReloadContext() { + LDR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); +} + +void A64Emitter::ReloadMembase() { + LDR(GetMembaseReg(), GetContextReg(), + offsetof(ppc::PPCContext, virtual_membase)); +} + +bool A64Emitter::ConstantFitsIn32Reg(uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. 
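+    // e.g. v = 0xFFFFFFFFFFFFFFF0 (-16): the top 33 bits are all ones, so the
+    // value round-trips through a sign-extended 32-bit immediate.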
+ return true; + } + return false; +} + +void A64Emitter::MovMem64(const oaknut::XRegSp& addr, intptr_t offset, + uint64_t v) { + if (v == 0) { + STR(XZR, addr, offset); + } else if (!(v >> 32)) { + // All high bits are zero, 32-bit MOV + MOV(W0, static_cast(v)); + STR(X0, addr, offset); + } else { + // 64bit number that needs double movs. + MOV(X0, v); + STR(X0, addr, offset); + } +} + +static const vec128_t v_consts[] = { + /* VZero */ vec128f(0.0f), + /* VOnePD */ vec128d(1.0), + /* VNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), + /* VFFFF */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VMaskX16Y16 */ + vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u), + /* VFlipX16Y16 */ + vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u), + /* VFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), + /* VNormalizeX16Y16 */ + vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), + /* V0001 */ vec128f(0.0f, 0.0f, 0.0f, 1.0f), + /* V3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), + /* V3331 */ vec128f(3.0f, 3.0f, 3.0f, 1.0f), + /* V3333 */ vec128f(3.0f, 3.0f, 3.0f, 3.0f), + /* VSignMaskPS */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VSignMaskPD */ + vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), + /* VAbsMaskPS */ + vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu), + /* VAbsMaskPD */ + vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu), + /* VByteSwapMask */ + vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), + /* VByteOrderMask */ + vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu), + /* VPermuteControl15 */ vec128b(15), + /* VPermuteByteMask */ vec128b(0x1F), + /* VPackD3DCOLORSat */ vec128i(0x404000FFu), + /* VPackD3DCOLOR */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), + /* VUnpackD3DCOLOR */ + vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu), + /* VPackFLOAT16_2 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u), + /* VUnpackFLOAT16_2 */ + vec128i(0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VPackFLOAT16_4 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u, 0x05040706u), + /* VUnpackFLOAT16_4 */ + vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VPackSHORT_Min */ vec128i(0x403F8001u), + /* VPackSHORT_Max */ vec128i(0x40407FFFu), + /* VPackSHORT_2 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u), + /* VPackSHORT_4 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu), + /* VUnpackSHORT_2 */ + vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VUnpackSHORT_4 */ + vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu), + /* VUnpackSHORT_Overflow */ vec128i(0x403F8000u), + /* VPackUINT_2101010_MinUnpacked */ + vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u), + /* VPackUINT_2101010_MaxUnpacked */ + vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u), + /* VPackUINT_2101010_MaskUnpacked */ + vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u), + /* VPackUINT_2101010_MaskPacked */ + vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30), + /* VPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30), + /* VUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u), + /* VPackULONG_4202020_MinUnpacked */ + vec128i(0x40380001u, 0x40380001u, 0x40380001u, 0x40400000u), + /* VPackULONG_4202020_MaxUnpacked */ + vec128i(0x4047FFFFu, 0x4047FFFFu, 0x4047FFFFu, 0x4040000Fu), + /* VPackULONG_4202020_MaskUnpacked */ + vec128i(0xFFFFFu, 0xFFFFFu, 0xFFFFFu, 0xFu), + /* 
VPackULONG_4202020_PermuteXZ */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0A0908FFu, 0xFF020100u), + /* VPackULONG_4202020_PermuteYW */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0CFFFF06u, 0x0504FFFFu), + /* VUnpackULONG_4202020_Permute */ + vec128i(0xFF0E0D0Cu, 0xFF0B0A09u, 0xFF080F0Eu, 0xFFFFFF0Bu), + /* VUnpackULONG_4202020_Overflow */ vec128i(0x40380000u), + /* VOneOver255 */ vec128f(1.0f / 255.0f), + /* VMaskEvenPI16 */ + vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), + /* VShiftMaskEvenPI16 */ + vec128i(0x0000000Fu, 0x0000000Fu, 0x0000000Fu, 0x0000000Fu), + /* VShiftMaskPS */ + vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), + /* VShiftByteMask */ + vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), + /* VSwapWordMask */ + vec128i(0x03030303u, 0x03030303u, 0x03030303u, 0x03030303u), + /* VUnsignedDwordMax */ + vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), + /* V255 */ vec128f(255.0f), + /* VPI32 */ vec128i(32), + /* VSignMaskI8 */ + vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u), + /* VSignMaskI16 */ + vec128i(0x80008000u, 0x80008000u, 0x80008000u, 0x80008000u), + /* VSignMaskI32 */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VSignMaskF32 */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VShortMinPS */ vec128f(SHRT_MIN), + /* VShortMaxPS */ vec128f(SHRT_MAX), + /* VIntMin */ vec128i(INT_MIN), + /* VIntMax */ vec128i(INT_MAX), + /* VIntMaxPD */ vec128d(INT_MAX), + /* VPosIntMinPS */ vec128f((float)0x80000000u), + /* VQNaN */ vec128i(0x7FC00000u), + /* VInt127 */ vec128i(0x7Fu), + /* V2To32 */ vec128f(0x1.0p32f), +}; + +// First location to try and place constants. +static const uintptr_t kConstDataLocation = 0x20000000; +static const uintptr_t kConstDataSize = sizeof(v_consts); + +// Increment the location by this amount for every allocation failure. +static const uintptr_t kConstDataIncrement = 0x00001000; + +// This function places constant data that is used by the emitter later on. +// Only called once and used by multiple instances of the emitter. +// +// TODO(DrChat): This should be placed in the code cache with the code, but +// doing so requires RIP-relative addressing, which is difficult to support +// given the current setup. +uintptr_t A64Emitter::PlaceConstData() { + uint8_t* ptr = reinterpret_cast(kConstDataLocation); + void* mem = nullptr; + while (!mem) { + mem = memory::AllocFixed( + ptr, xe::round_up(kConstDataSize, memory::page_size()), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + + ptr += kConstDataIncrement; + } + + // The pointer must not be greater than 31 bits. + assert_zero(reinterpret_cast(mem) & ~0x7FFFFFFF); + std::memcpy(mem, v_consts, sizeof(v_consts)); + memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr); + + return reinterpret_cast(mem); +} + +void A64Emitter::FreeConstData(uintptr_t data) { + memory::DeallocFixed(reinterpret_cast(data), 0, + memory::DeallocationType::kRelease); +} + +uintptr_t A64Emitter::GetVConstPtr() const { return backend_->emitter_data(); } + +uintptr_t A64Emitter::GetVConstPtr(VConst id) const { + // Load through fixed constant table setup by PlaceConstData. + // It's important that the pointer is not signed, as it will be sign-extended. + return GetVConstPtr() + GetVConstOffset(id); +} + +// Implies possible StashV(0, ...)! +void A64Emitter::LoadConstantV(oaknut::QReg dest, const vec128_t& v) { + if (!v.low && !v.high) { + // 0000... 
+ // MOVI is implemented as a register-rename while EOR(x, x, x) is not + // https://dougallj.github.io/applecpu/firestorm.html + MOVI(dest.B16(), 0); + } else if (v.low == ~uint64_t(0) && v.high == ~uint64_t(0)) { + // 1111... + MOVI(dest.B16(), 0xFF); + } else { + // Try to figure out some common splat-patterns to utilize MOVI rather than + // stashing to memory. + const bool all_same_u8 = + std::adjacent_find(std::cbegin(v.u8), std::cend(v.u8), + std::not_equal_to<>()) == std::cend(v.u8); + + if (all_same_u8) { + // 0xXX, 0xXX, 0xXX... + MOVI(dest.B16(), v.u8[0]); + return; + } + + const bool all_same_u16 = + std::adjacent_find(std::cbegin(v.u16), std::cend(v.u16), + std::not_equal_to<>()) == std::cend(v.u16); + + if (all_same_u16) { + if ((v.u16[0] & 0xFF00) == 0) { + // 0x00XX, 0x00XX, 0x00XX... + MOVI(dest.H8(), uint8_t(v.u16[0])); + return; + } else if ((v.u16[0] & 0x00FF) == 0) { + // 0xXX00, 0xXX00, 0xXX00... + MOVI(dest.H8(), uint8_t(v.u16[0] >> 8), oaknut::util::LSL, 8); + return; + } + } + + const bool all_same_u32 = + std::adjacent_find(std::cbegin(v.u32), std::cend(v.u32), + std::not_equal_to<>()) == std::cend(v.u32); + + if (all_same_u32) { + if ((v.u32[0] & 0x00FFFFFF) == 0) { + // This is used a lot for certain float-splats and should be checked + // first before the others + // 0xXX000000, 0xXX000000, 0xXX000000... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 24), oaknut::util::LSL, 24); + return; + } else if ((v.u32[0] & 0xFFFFFF00) == 0) { + // 0x000000XX, 0x000000XX, 0x000000XX... + MOVI(dest.S4(), uint8_t(v.u32[0])); + return; + } else if ((v.u32[0] & 0xFFFF00FF) == 0) { + // 0x0000XX00, 0x0000XX00, 0x0000XX00... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 8), oaknut::util::LSL, 8); + return; + } else if ((v.u32[0] & 0xFF00FFFF) == 0) { + // 0x00XX0000, 0x00XX0000, 0x00XX0000... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 16), oaknut::util::LSL, 16); + return; + } + + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(v.u32[0], fp8)) { + FMOV(dest.S4(), fp8); + return; + } + } + + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. + MovMem64(SP, kStashOffset, v.low); + MovMem64(SP, kStashOffset + 8, v.high); + LDR(dest, SP, kStashOffset); + } +} + +void A64Emitter::LoadConstantV(oaknut::QReg dest, float v) { + union { + float f; + uint32_t i; + } x = {v}; + if (!x.i) { + // +0.0f (but not -0.0f because it may be used to flip the sign via xor). + MOVI(dest.B16(), 0); + } else if (x.i == ~uint32_t(0)) { + // 1111... + MOVI(dest.B16(), 0xFF); + } else { + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. + + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(x.i, fp8)) { + FMOV(dest.toS(), fp8); + return; + } + + MOV(W0, x.i); + FMOV(dest.toS(), W0); + } +} + +void A64Emitter::LoadConstantV(oaknut::QReg dest, double v) { + union { + double d; + uint64_t i; + } x = {v}; + if (!x.i) { + // +0.0 (but not -0.0 because it may be used to flip the sign via xor). + MOVI(dest.toD(), oaknut::RepImm(0)); + } else if (x.i == ~uint64_t(0)) { + // 1111... + MOVI(dest.toD(), oaknut::RepImm(0xFF)); + } else { + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. 
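+    // Note: FMOV (immediate) can only materialize values of the form
+    // +/-(16..31)/16 * 2^r with r in [-3, 4]; f64_to_fimm8 is expected to
+    // reject anything else, in which case we fall back to moving the raw bits
+    // through X0 below.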
+ + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f64_to_fimm8(x.i, fp8)) { + FMOV(dest.toD(), fp8); + return; + } + + MOV(X0, x.i); + FMOV(dest.toD(), X0); + } +} + +uintptr_t A64Emitter::StashV(int index, const oaknut::QReg& r) { + // auto addr = ptr[rsp + kStashOffset + (index * 16)]; + // vmovups(addr, r); + const auto addr = kStashOffset + (index * 16); + STR(r, SP, addr); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, float v) { + union { + float f; + uint32_t i; + } x = {v}; + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, x.i); + MovMem64(SP, addr + 8, 0); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, double v) { + union { + double d; + uint64_t i; + } x = {v}; + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, x.i); + MovMem64(SP, addr + 8, 0); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, const vec128_t& v) { + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, v.low); + MovMem64(SP, addr + 8, v.high); + return addr; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_emitter.h b/src/xenia/cpu/backend/a64/a64_emitter.h new file mode 100644 index 000000000..629c67a4b --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_emitter.h @@ -0,0 +1,267 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ +#define XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ + +#include +#include + +#include "xenia/base/arena.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/function_trace_data.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/hir/instr.h" +#include "xenia/cpu/hir/value.h" +#include "xenia/memory.h" + +#include "oaknut/code_block.hpp" +#include "oaknut/oaknut.hpp" + +namespace xe { +namespace cpu { +class Processor; +} // namespace cpu +} // namespace xe + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Backend; +class A64CodeCache; + +struct EmitFunctionInfo; + +enum RegisterFlags { + REG_DEST = (1 << 0), + REG_ABCD = (1 << 1), +}; + +enum VConst { + VZero = 0, + VOnePD, + VNegativeOne, + VFFFF, + VMaskX16Y16, + VFlipX16Y16, + VFixX16Y16, + VNormalizeX16Y16, + V0001, + V3301, + V3331, + V3333, + VSignMaskPS, + VSignMaskPD, + VAbsMaskPS, + VAbsMaskPD, + VByteSwapMask, + VByteOrderMask, + VPermuteControl15, + VPermuteByteMask, + VPackD3DCOLORSat, + VPackD3DCOLOR, + VUnpackD3DCOLOR, + VPackFLOAT16_2, + VUnpackFLOAT16_2, + VPackFLOAT16_4, + VUnpackFLOAT16_4, + VPackSHORT_Min, + VPackSHORT_Max, + VPackSHORT_2, + VPackSHORT_4, + VUnpackSHORT_2, + VUnpackSHORT_4, + VUnpackSHORT_Overflow, + VPackUINT_2101010_MinUnpacked, + VPackUINT_2101010_MaxUnpacked, + VPackUINT_2101010_MaskUnpacked, + VPackUINT_2101010_MaskPacked, + VPackUINT_2101010_Shift, + VUnpackUINT_2101010_Overflow, + VPackULONG_4202020_MinUnpacked, + VPackULONG_4202020_MaxUnpacked, + VPackULONG_4202020_MaskUnpacked, + VPackULONG_4202020_PermuteXZ, + VPackULONG_4202020_PermuteYW, + VUnpackULONG_4202020_Permute, + VUnpackULONG_4202020_Overflow, + 
VOneOver255, + VMaskEvenPI16, + VShiftMaskEvenPI16, + VShiftMaskPS, + VShiftByteMask, + VSwapWordMask, + VUnsignedDwordMax, + V255, + VPI32, + VSignMaskI8, + VSignMaskI16, + VSignMaskI32, + VSignMaskF32, + VShortMinPS, + VShortMaxPS, + VIntMin, + VIntMax, + VIntMaxPD, + VPosIntMinPS, + VQNaN, + VInt127, + V2To32, +}; + +enum A64EmitterFeatureFlags { + kA64EmitLSE = 1 << 0, + kA64EmitF16C = 1 << 1, +}; + +class A64Emitter : public oaknut::VectorCodeGenerator { + public: + A64Emitter(A64Backend* backend); + virtual ~A64Emitter(); + + Processor* processor() const { return processor_; } + A64Backend* backend() const { return backend_; } + + static uintptr_t PlaceConstData(); + static void FreeConstData(uintptr_t data); + + bool Emit(GuestFunction* function, hir::HIRBuilder* builder, + uint32_t debug_info_flags, FunctionDebugInfo* debug_info, + void** out_code_address, size_t* out_code_size, + std::vector* out_source_map); + + public: + // Reserved: XSP, X27, X28 + // Scratch: X1-X15, X30 | V0-v7 and V16-V31 + // V0-2 + // Available: X19-X26 + // V4-V15 (save to get V3) + static const size_t GPR_COUNT = 8; + static const size_t FPR_COUNT = 8; + + static void SetupReg(const hir::Value* v, oaknut::WReg& r) { + const auto idx = gpr_reg_map_[v->reg.index]; + r = oaknut::WReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::XReg& r) { + const auto idx = gpr_reg_map_[v->reg.index]; + r = oaknut::XReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::SReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::SReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::DReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::DReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::QReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::QReg(idx); + } + + // Gets(and possibly create) an HIR label with the specified name + oaknut::Label* lookup_label(const char* label_name) { + return &label_lookup_[label_name]; + } + + oaknut::Label& epilog_label() { return *epilog_label_; } + + void MarkSourceOffset(const hir::Instr* i); + + void DebugBreak(); + void Trap(uint16_t trap_type = 0); + void UnimplementedInstr(const hir::Instr* i); + + void Call(const hir::Instr* instr, GuestFunction* function); + void CallIndirect(const hir::Instr* instr, const oaknut::XReg& reg); + void CallExtern(const hir::Instr* instr, const Function* function); + void CallNative(void* fn); + void CallNative(uint64_t (*fn)(void* raw_context)); + void CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)); + void CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), + uint64_t arg0); + void CallNativeSafe(void* fn); + void SetReturnAddress(uint64_t value); + + static oaknut::XReg GetNativeParam(uint32_t param); + + static oaknut::XReg GetContextReg(); + static oaknut::XReg GetMembaseReg(); + void ReloadContext(); + void ReloadMembase(); + + // Moves a 64bit immediate into memory. 
+ static bool ConstantFitsIn32Reg(uint64_t v); + void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v); + + uintptr_t GetVConstPtr() const; + uintptr_t GetVConstPtr(VConst id) const; + static constexpr uintptr_t GetVConstOffset(VConst id) { + return sizeof(vec128_t) * id; + } + void LoadConstantV(oaknut::QReg dest, float v); + void LoadConstantV(oaknut::QReg dest, double v); + void LoadConstantV(oaknut::QReg dest, const vec128_t& v); + + // Returned addresses are relative to XSP + uintptr_t StashV(int index, const oaknut::QReg& r); + uintptr_t StashConstantV(int index, float v); + uintptr_t StashConstantV(int index, double v); + uintptr_t StashConstantV(int index, const vec128_t& v); + + bool IsFeatureEnabled(uint32_t feature_flag) const { + return (feature_flags_ & feature_flag) == feature_flag; + } + + FunctionDebugInfo* debug_info() const { return debug_info_; } + + size_t stack_size() const { return stack_size_; } + + protected: + void* Emplace(const EmitFunctionInfo& func_info, + GuestFunction* function = nullptr); + bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info); + void EmitGetCurrentThreadId(); + void EmitTraceUserCallReturn(); + + protected: + Processor* processor_ = nullptr; + A64Backend* backend_ = nullptr; + A64CodeCache* code_cache_ = nullptr; + uint32_t feature_flags_ = 0; + + std::vector assembly_buffer; + + oaknut::Label* epilog_label_ = nullptr; + + // Convert from plain-text label-names into oaknut-labels + std::unordered_map label_lookup_; + + hir::Instr* current_instr_ = nullptr; + + FunctionDebugInfo* debug_info_ = nullptr; + uint32_t debug_info_flags_ = 0; + FunctionTraceData* trace_data_ = nullptr; + Arena source_map_arena_; + + size_t stack_size_ = 0; + + static const uint8_t gpr_reg_map_[GPR_COUNT]; + static const uint8_t fpr_reg_map_[FPR_COUNT]; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ diff --git a/src/xenia/cpu/backend/a64/a64_function.cc b/src/xenia/cpu/backend/a64/a64_function.cc new file mode 100644 index 000000000..9167bde7c --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_function.cc @@ -0,0 +1,45 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_function.h" + +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +A64Function::A64Function(Module* module, uint32_t address) + : GuestFunction(module, address) {} + +A64Function::~A64Function() { + // machine_code_ is freed by code cache. 
+} + +void A64Function::Setup(uint8_t* machine_code, size_t machine_code_length) { + machine_code_ = machine_code; + machine_code_length_ = machine_code_length; +} + +bool A64Function::CallImpl(ThreadState* thread_state, uint32_t return_address) { + auto backend = + reinterpret_cast(thread_state->processor()->backend()); + auto thunk = backend->host_to_guest_thunk(); + thunk(machine_code_, thread_state->context(), + reinterpret_cast(uintptr_t(return_address))); + return true; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_function.h b/src/xenia/cpu/backend/a64/a64_function.h new file mode 100644 index 000000000..d4c568567 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_function.h @@ -0,0 +1,44 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ +#define XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ + +#include "xenia/cpu/function.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Function : public GuestFunction { + public: + A64Function(Module* module, uint32_t address); + ~A64Function() override; + + uint8_t* machine_code() const override { return machine_code_; } + size_t machine_code_length() const override { return machine_code_length_; } + + void Setup(uint8_t* machine_code, size_t machine_code_length); + + protected: + bool CallImpl(ThreadState* thread_state, uint32_t return_address) override; + + private: + uint8_t* machine_code_ = nullptr; + size_t machine_code_length_ = 0; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ diff --git a/src/xenia/cpu/backend/a64/a64_op.h b/src/xenia/cpu/backend/a64/a64_op.h new file mode 100644 index 000000000..2b2f58932 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_op.h @@ -0,0 +1,618 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ +#ifndef XENIA_CPU_BACKEND_A64_A64_OP_H_ +#define XENIA_CPU_BACKEND_A64_A64_OP_H_ + +#include "xenia/cpu/backend/a64/a64_emitter.h" + +#include "xenia/cpu/hir/instr.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; +using namespace oaknut; +using namespace oaknut::util; + +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
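+// For example, logical byte 0 is physical byte 3 and logical byte 5 is
+// physical byte 6 (VEC128_B(n) == n ^ 3); logical halfword 0 is physical
+// halfword 1 (VEC128_W(n) == n ^ 1); dwords and floats need no flipping.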
+#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + uint32_t value; + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + + operator uint32_t() const { return value; } + + InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = + GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; + + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.offset; } +}; + +struct SymbolOp : Op { + Function* value; + + protected: + friend struct Op; + template + friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; + + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.label; } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + assert_true(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { return reg(); } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); + } else if (!is_constant && !b.is_constant) { + return reg_.index() == b.reg_.index(); + } else { + return false; + } + } + bool IsEqual(const oaknut::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.index() == b.index(); + } else { + return false; + } + } + bool operator==(const T& b) const { return IsEqual(b); } + bool operator!=(const T& b) const { return !IsEqual(b); } + bool operator==(const 
oaknut::Reg& b) const { return IsEqual(b); } + bool operator!=(const oaknut::Reg& b) const { return !IsEqual(b); } + void Load(const Instr::Op& op) { + value = op.value; + is_constant = value->IsConstant(); + if (!is_constant) { + A64Emitter::SetupReg(value, reg_); + } + } + + protected: + REG_TYPE reg_ = REG_TYPE(0); +}; + +struct I8Op : ValueOp { + typedef ValueOp BASE; + const int8_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i8; + } +}; +struct I16Op : ValueOp { + typedef ValueOp BASE; + const int16_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i16; + } +}; +struct I32Op : ValueOp { + typedef ValueOp BASE; + const int32_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i32; + } +}; +struct I64Op : ValueOp { + typedef ValueOp BASE; + const int64_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = BASE::value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; + } +}; +struct F32Op : ValueOp { + typedef ValueOp BASE; + const float constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f32; + } +}; +struct F64Op : ValueOp { + typedef ValueOp BASE; + const double constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f64; + } +}; +struct V128Op : ValueOp { + typedef ValueOp BASE; + const vec128_t& constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.v128; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; + + protected: + bool LoadDest(const Instr* i) { + Instr::Op op; + op.value = i->dest; + dest.Load(op); + return true; + } +}; +template <> +struct DestField { + protected: + bool LoadDest(const Instr* i) { return true; } +}; + +template +struct I; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1 = {}; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && 
BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +static const T GetTempReg(A64Emitter& e); +template <> +const WReg GetTempReg(A64Emitter& e) { + return W0; +} +template <> +const XReg GetTempReg(A64Emitter& e) { + return X0; +} + +template +struct Sequence { + typedef T EmitArgType; + + static constexpr uint32_t head_key() { return T::key; } + + static bool Select(A64Emitter& e, const Instr* i) { + T args; + if (!args.Load(i)) { + return false; + } + SEQ::Emit(e, args); + return true; + } + + template + static void EmitUnaryOp(A64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.MOV(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.MOV(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + static void EmitCommutativeBinaryOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + if (i.src2.is_constant) { + // Both constants. + if (i.src1.ConstantFitsIn32Reg()) { + e.MOV(i.dest, i.src2.constant()); + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else if (i.src2.ConstantFitsIn32Reg()) { + e.MOV(i.dest, i.src1.constant()); + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + e.MOV(i.dest, i.src1.constant()); + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + // src1 constant. 
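        // src2 is in a register. If dest already aliases src2, fold the
        // constant in directly (the op is commutative); otherwise materialize
        // the constant into dest and then combine it with src2.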
+ if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2); + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2); + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, temp); + } else { + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeBinaryVOp(A64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantV(Q0, i.src1.constant()); + fn(e, i.dest, REG(0), i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantV(Q0, i.src2.constant()); + fn(e, i.dest, i.src1, REG(0)); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryVOp(A64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantV(Q0, i.src1.constant()); + fn(e, i.dest, REG(0), i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantV(Q0, i.src2.constant()); + fn(e, i.dest, i.src1, REG(0)); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitCommutativeCompareOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if 
(i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), + true); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), + false); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_OP_H_ diff --git a/src/xenia/cpu/backend/a64/a64_seq_control.cc b/src/xenia/cpu/backend/a64/a64_seq_control.cc new file mode 100644 index 000000000..e68d2955b --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_control.cc @@ -0,0 +1,551 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/a64/a64_op.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_control = 0; + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +struct DEBUG_BREAK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +struct DEBUG_BREAK_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.DebugBreak(); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +struct TRAP : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.Trap(i.instr->flags); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +struct TRAP_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_F32 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, + TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, + TRAP_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +struct CALL : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src1.value->is_guest()); + e.Call(i.instr, static_cast(i.src1.value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +struct CALL_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, + CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, + CALL_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +struct CALL_INDIRECT + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// ============================================================================ +struct CALL_INDIRECT_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct 
CALL_INDIRECT_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +struct CALL_EXTERN + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); + +// ============================================================================ +// OPCODE_RETURN +// ============================================================================ +struct RETURN : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. 
+ if (i.instr->next || i.instr->block->next) { + e.B(e.epilog_label()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +struct RETURN_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, 0); + e.B(Cond::NE, e.epilog_label()); + } +}; +struct RETURN_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, 0); + e.B(Cond::NE, e.epilog_label()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, + RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, + RETURN_TRUE_F64); + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +struct SET_RETURN_ADDRESS + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); + +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +struct BRANCH : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src1.value->name); + assert_not_null(label); + e.B(*label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +struct BRANCH_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::NE, *label); + } +}; +struct BRANCH_TRUE_F64 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::NE, *label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, + BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +struct BRANCH_FALSE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::EQ, *label); + } +}; +struct BRANCH_FALSE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::EQ, *label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, + BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/a64/a64_seq_memory.cc b/src/xenia/cpu/backend/a64/a64_seq_memory.cc new file mode 100644 index 000000000..d7d66a14d --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_memory.cc @@ -0,0 +1,1207 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/base/memory.h" +#include "xenia/cpu/backend/a64/a64_op.h" +#include "xenia/cpu/backend/a64/a64_tracers.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_memory = 0; + +template +XReg ComputeMemoryAddressOffset(A64Emitter& e, const T& guest, const T& offset, + WReg address_register = W3) { + assert_true(offset.is_constant); + const int32_t offset_const = static_cast(offset.constant()); + + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += offset_const; + if (address < 0x80000000) { + e.MOV(address_register.toX(), address); + e.ADD(address_register.toX(), e.GetMembaseReg(), address_register.toX()); + return address_register.toX(); + } else { + if (address >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(W0, address + 0x1000); + } else { + e.MOV(W0, address); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W0, 0xE0000000 - offset_const); + e.CMP(guest.reg().toW(), W0); + e.CSET(W0, Cond::HS); + e.ADD(W0, guest.reg().toW(), W0, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.MOV(W0, guest.reg().toW()); + } + e.MOV(X1, offset_const); + e.ADD(X0, X0, X1); + + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } +} + +// Note: most *should* be aligned, but needs to be checked! +template +XReg ComputeMemoryAddress(A64Emitter& e, const T& guest, + WReg address_register = W3) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. + const uint32_t address = static_cast(guest.constant()); + if (address < 0x80000000) { + e.MOV(W0, address); + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } else { + if (address >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(W0, address + 0x1000u); + } else { + e.MOV(W0, address); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W0, 0xE0000000); + e.CMP(guest.reg().toW(), W0); + e.CSET(W0, Cond::HS); + e.ADD(W0, guest.reg().toW(), W0, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.MOV(W0, guest.reg().toW()); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } +} + +// ============================================================================ +// OPCODE_ATOMIC_EXCHANGE +// ============================================================================ +// Note that the address we use here is a real, host address! +// This is weird, and should be fixed. 
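For reference, the guest-to-host translation that the ComputeMemoryAddress helpers above emit reduces to the following plain C++ (a minimal sketch; membase and granularity stand in for e.GetMembaseReg() and xe::memory::allocation_granularity(), and HostAddress is a hypothetical helper, not part of the backend):

#include <cstddef>
#include <cstdint>

inline uint8_t* HostAddress(uint8_t* membase, uint32_t guest_address,
                            size_t granularity) {
  uint32_t offset = guest_address;
  if (granularity > 0x1000 && guest_address >= 0xE0000000u) {
    // Emulate the 4 KB physical-page offset that cannot be expressed via
    // memory mapping when the host allocation granularity exceeds 4 KB.
    offset += 0x1000;
  }
  return membase + offset;
}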
+template +void EmitAtomicExchangeXX(A64Emitter& e, const ARGS& i, const FN& fn) { + if (i.dest == i.src1) { + e.MOV(X0, i.src1); + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.MOV(i.dest, i.src2.constant()); + } else { + e.MOV(i.dest, i.src2); + } + } + fn(e, i.dest, X0); + } else { + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.MOV(i.dest, i.src2.constant()); + } else { + e.MOV(i.dest, i.src2); + } + } + fn(e, i.dest, i.src1); + } +} +struct ATOMIC_EXCHANGE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPALB(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPALH(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPAL(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, XReg dest, XReg src) { e.SWPAL(dest, dest, src); }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, + ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, + ATOMIC_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W3, 0xE0000000); + e.CMP(i.src1.reg().toW(), W3); + e.CSET(W1, Cond::HS); + e.ADD(W1, i.src1.reg().toW(), W1, LSL, 12); + } else { + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(X1, e.GetMembaseReg(), X1); + + const XReg address = X1; + const WReg expected = i.src2; + const WReg desired = i.src3; + const WReg status = W0; + + if (e.IsFeatureEnabled(kA64EmitLSE)) { + e.MOV(status, expected); + + // if([C] == A) [C] = B + // else A = [C] + e.CASAL(status, desired, address); + e.CMP(status, expected); + e.CSET(i.dest, Cond::EQ); + return; + } + + oaknut::Label success, fail, retry; + + e.l(retry); + e.LDAXR(W4, address); + e.CMP(W4, expected); + e.B(Cond::NE, fail); + + e.STLXR(status.toW(), desired, address); + e.CBNZ(status, retry); + e.B(success); + + e.l(fail); + e.CLREX(); + + e.l(success); + e.CSET(i.dest, Cond::EQ); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. 
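      // CSET yields 1 when the guest address is >= 0xE0000000; the LSL #12 on
      // the following ADD turns that into the 0x1000 page offset.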
+ e.MOV(W3, 0xE0000000); + e.CMP(i.src1.reg(), X3); + e.CSET(W1, Cond::HS); + e.ADD(W1, i.src1.reg().toW(), W1, LSL, 12); + } else { + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(X1, e.GetMembaseReg(), X1); + + const XReg address = X1; + const XReg expected = i.src2; + const XReg desired = i.src3; + const XReg status = X0; + + if (e.IsFeatureEnabled(kA64EmitLSE)) { + e.MOV(status, expected); + + // if([C] == A) [C] = B + // else A = [C] + e.CASAL(status, desired, address); + e.CMP(status, expected); + e.CSET(i.dest, Cond::EQ); + return; + } + + oaknut::Label success, fail, retry; + + e.l(retry); + e.LDAXR(X4, address); + e.CMP(X4, expected); + e.B(Cond::NE, fail); + + e.STLXR(status.toW(), desired, address); + e.CBNZ(status, retry); + e.B(success); + + e.l(fail); + e.CLREX(); + + e.l(success); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct LOAD_LOCAL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRB(i.dest, SP, i.src1.constant()); + // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRH(i.dest, SP, i.src1.constant()); + // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, LOAD_LOCAL_I8, LOAD_LOCAL_I16, + LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, + LOAD_LOCAL_F64, LOAD_LOCAL_V128); + +// ============================================================================ +// OPCODE_STORE_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. 
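// As with the loads above, the slot offset is the constant first operand and
// every access is SP-relative; e.g. a 32-bit local at offset 0x20 comes out
// as e.STR(Wn, SP, 0x20) on store and e.LDR(Wn, SP, 0x20) on reload.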
+struct STORE_LOCAL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); + e.STRB(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); + e.STRH(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, + STORE_LOCAL_I32, STORE_LOCAL_I64, STORE_LOCAL_F32, + STORE_LOCAL_F64, STORE_LOCAL_V128); + +// ============================================================================ +// OPCODE_LOAD_CONTEXT +// ============================================================================ +struct LOAD_CONTEXT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRB(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(0), i.src1.value); + e.LDRB(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI8)); + } + } +}; +struct LOAD_CONTEXT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRH(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDRH(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI16)); + } + } +}; +struct LOAD_CONTEXT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +struct LOAD_CONTEXT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI64)); + } + } +}; +struct LOAD_CONTEXT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF32)); + } + } +}; 
+struct LOAD_CONTEXT_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF64)); + } + } +}; +struct LOAD_CONTEXT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, + LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, + LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); + +// ============================================================================ +// OPCODE_STORE_CONTEXT +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_CONTEXT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRB(W0, e.GetContextReg(), i.src1.value); + } else { + e.STRB(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDRB(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI8)); + } + } +}; +struct STORE_CONTEXT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRH(W0, e.GetContextReg(), i.src1.value); + } else { + e.STRH(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDRH(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI16)); + } + } +}; +struct STORE_CONTEXT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STR(W0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +struct STORE_CONTEXT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.constant()); + e.STR(X0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI64)); + } + } +}; +struct STORE_CONTEXT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.value->constant.i32); + e.STR(W0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF32)); + } + } +}; +struct STORE_CONTEXT_F64 + : 
Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.value->constant.i64); + e.STR(X0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF64)); + } + } +}; +struct STORE_CONTEXT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.LoadConstantV(Q0, i.src2.constant()); + e.STR(Q0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, + STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, + STORE_CONTEXT_F64, STORE_CONTEXT_V128); + +// ============================================================================ +// OPCODE_LOAD_MMIO +// ============================================================================ +// Note: all types are always aligned in the context. +struct LOAD_MMIO_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // uint64_t (context, addr) + const auto mmio_range = reinterpret_cast(i.src1.value); + const auto read_address = uint32_t(i.src2.value); + e.MOV(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.MOV(e.GetNativeParam(1).toW(), read_address); + e.CallNativeSafe(reinterpret_cast(mmio_range->read)); + e.REV(i.dest, W0); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(0).toW(), i.dest); + e.MOV(X1, read_address); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); + +// ============================================================================ +// OPCODE_STORE_MMIO +// ============================================================================ +// Note: all types are always aligned on the stack. 
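Both the LOAD_MMIO sequence above and the STORE_MMIO sequence below go through the range's read/write callbacks and byte-swap around the call, since the handlers work in host byte order while guest accesses are big-endian. A minimal sketch of a matching handler pair, following the (context, address[, value]) convention noted in the comments above; the FakeGpuRegs type and the GpuBlockRead/GpuBlockWrite names are hypothetical:

#include <cstdint>
#include <unordered_map>

// Hypothetical register file backing one MMIO range.
struct FakeGpuRegs {
  std::unordered_map<uint32_t, uint32_t> regs;
};

uint32_t GpuBlockRead(void* callback_context, uint32_t address) {
  auto* gpu = static_cast<FakeGpuRegs*>(callback_context);
  // Returned in host byte order; LOAD_MMIO REVs the result into guest order.
  return gpu->regs[address];
}

void GpuBlockWrite(void* callback_context, uint32_t address, uint32_t value) {
  auto* gpu = static_cast<FakeGpuRegs*>(callback_context);
  // STORE_MMIO byte-swaps the value before making this call.
  gpu->regs[address] = value;
}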
+struct STORE_MMIO_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // void (context, addr, value) + const auto mmio_range = reinterpret_cast(i.src1.value); + const auto write_address = uint32_t(i.src2.value); + e.MOV(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.MOV(e.GetNativeParam(1).toW(), write_address); + if (i.src3.is_constant) { + e.MOV(e.GetNativeParam(2).toW(), xe::byte_swap(i.src3.constant())); + } else { + e.REV(e.GetNativeParam(2).toW(), i.src3); + } + e.CallNativeSafe(reinterpret_cast(mmio_range->write)); + if (IsTracingData()) { + if (i.src3.is_constant) { + e.MOV(e.GetNativeParam(0).toW(), i.src3.constant()); + } else { + e.MOV(e.GetNativeParam(0).toW(), i.src3); + } + e.MOV(X1, write_address); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); + +// ============================================================================ +// OPCODE_LOAD_OFFSET +// ============================================================================ +struct LOAD_OFFSET_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + e.LDRB(i.dest, addr_reg); + } +}; + +struct LOAD_OFFSET_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDRH(i.dest, addr_reg); + e.REV16(i.dest, i.dest); + } else { + e.LDRH(i.dest, addr_reg); + } + } +}; + +struct LOAD_OFFSET_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + } +}; + +struct LOAD_OFFSET_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, + LOAD_OFFSET_I32, LOAD_OFFSET_I64); + +// ============================================================================ +// OPCODE_STORE_OFFSET +// ============================================================================ +struct STORE_OFFSET_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STRB(W0, addr_reg); + } else { + e.STRB(i.src3, addr_reg); + } + } +}; + +struct STORE_OFFSET_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STRH(W0, addr_reg); + } else { + e.STRH(i.src3, addr_reg); + } + } + } +}; + +struct STORE_OFFSET_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { 
+ assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STR(W0, addr_reg); + } else { + e.STR(i.src3, addr_reg); + } + } + } +}; + +struct STORE_OFFSET_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MovMem64(addr_reg, 0, i.src3.constant()); + } else { + e.STR(i.src3, addr_reg); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, + STORE_OFFSET_I32, STORE_OFFSET_I64); + +// ============================================================================ +// OPCODE_LOAD +// ============================================================================ +struct LOAD_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRB(i.dest, addr_reg); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); + } + } +}; +struct LOAD_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDRH(i.dest, addr_reg); + e.REV16(i.dest, i.dest); + } else { + e.LDRH(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); + } + } +}; +struct LOAD_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } + } +}; +struct LOAD_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV64(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); + } + } +}; +struct LOAD_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); + } + } +}; +struct LOAD_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + 
e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); + } + } +}; +struct LOAD_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + // Reverse upper and lower 64-bit halfs + e.REV64(i.dest.reg().B16(), i.dest.reg().B16()); + // Reverse the 64-bit halfs themselves + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, + LOAD_F32, LOAD_F64, LOAD_V128); + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +struct STORE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRB(W0, addr_reg); + } else { + e.STRB(i.src2.reg(), addr_reg); + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRB(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); + } + } +}; +struct STORE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRH(W0, addr_reg); + } else { + e.STRH(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRH(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); + } + } +}; +struct STORE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STR(W0, addr_reg); + } else { + e.STR(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } + } +}; +struct STORE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MovMem64(addr_reg, 0, i.src2.constant()); + } else { + e.STR(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); + } + } +}; +struct STORE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto 
addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.value->constant.i32); + e.STR(W0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); + } + } +}; +struct STORE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.value->constant.i64); + e.STR(X0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); + } + } +}; +struct STORE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + // Reverse upper and lower 64-bit halfs + e.REV64(Q0.B16(), i.src2.reg().B16()); + // Reverse the 64-bit halfs themselves + e.EXT(Q0.B16(), Q0.B16(), Q0.B16(), 8); + e.STR(Q0, addr_reg); + } else { + if (i.src2.is_constant) { + e.LoadConstantV(Q0, i.src2.constant()); + e.STR(Q0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, + STORE_F32, STORE_F64, STORE_V128); + +// ============================================================================ +// OPCODE_CACHE_CONTROL +// ============================================================================ +struct CACHE_CONTROL + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + bool is_clflush = false, is_prefetch = false; + switch (CacheControlType(i.instr->flags)) { + case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH: + case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE: + is_prefetch = true; + break; + case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE: + case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH: + is_clflush = true; + break; + default: + assert_unhandled_case(CacheControlType(i.instr->flags)); + return; + } + size_t cache_line_size = i.src2.value; + + XReg addr = X0; + uint32_t address_constant; + if (i.src1.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. 
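+      // Guest addresses at or above 0x80000000 are materialized into X1 and
+      // added to the membase register below; the 0xE0000000+ range also gets
+      // the extra 4 KiB bias when the host allocation granularity is larger
+      // than 4 KiB, mirroring the non-constant path further down.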
+ address_constant = static_cast(i.src1.constant()); + if (address_constant < 0x80000000) { + e.ADD(addr, e.GetMembaseReg(), address_constant); + } else { + if (address_constant >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(X1, address_constant + 0x1000); + } else { + e.MOV(X1, address_constant); + } + e.ADD(addr, e.GetMembaseReg(), X1); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(X1, 0xE0000000); + e.CMP(i.src1.reg(), X1); + e.CSET(X1, Cond::HS); + e.ADD(X1, i.src1.reg(), X1, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(addr, e.GetMembaseReg(), X1); + } + + if (is_clflush) { + // TODO(wunkolo): These kind of cache-maintenance instructions cause an + // illegal-instruction on windows, but is trapped to proper EL1 code on + // Linux. Need a way to do cache-maintenance on Windows-Arm + // e.DC(DcOp::CIVAC, addr); + + // Full data sync + e.DSB(BarrierOp::ISH); + } + if (is_prefetch) { + e.PRFM(PrfOp::PLDL1KEEP, addr); + } + + if (cache_line_size >= 128) { + // Prefetch the other 64 bytes of the 128-byte cache line. + if (i.src1.is_constant && address_constant < 0x80000000) { + e.ADD(addr, e.GetMembaseReg(), address_constant ^ 64); + } else { + e.EOR(X1, X1, 64); + } + if (is_clflush) { + // TODO(wunkolo): These kind of cache-maintenance instructions cause an + // illegal-instruction on windows, but is trapped to proper EL1 code on + // Linux. Need a way to do cache-maintenance on Windows-Arm + // e.DC(DcOp::CIVAC, addr); + + // Full data sync + e.DSB(BarrierOp::ISH); + } + if (is_prefetch) { + e.PRFM(PrfOp::PLDL1KEEP, addr); + } + assert_true(cache_line_size == 128); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CACHE_CONTROL, CACHE_CONTROL); + +// ============================================================================ +// OPCODE_MEMORY_BARRIER +// ============================================================================ +struct MEMORY_BARRIER + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.DMB(BarrierOp::SY); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); + +// ============================================================================ +// OPCODE_MEMSET +// ============================================================================ +struct MEMSET_I64_I8_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + assert_true(i.src3.is_constant); + assert_true(i.src2.constant() == 0); + e.MOVI(Q0.B16(), 0); + auto addr_reg = ComputeMemoryAddress(e, i.src1); + switch (i.src3.constant()) { + case 32: + e.STP(Q0, Q0, addr_reg, 0 * 16); + break; + case 128: + e.STP(Q0, Q0, addr_reg, 0 * 16); + e.STP(Q0, Q0, addr_reg, 2 * 16); + e.STP(Q0, Q0, addr_reg, 4 * 16); + e.STP(Q0, Q0, addr_reg, 6 * 16); + break; + default: + assert_unhandled_case(i.src3.constant()); + break; + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(2), i.src3.constant()); + e.MOV(e.GetNativeParam(1), i.src2.constant()); + e.LDR(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemset)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc 
b/src/xenia/cpu/backend/a64/a64_seq_vector.cc new file mode 100644 index 000000000..abc4688ac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc @@ -0,0 +1,2170 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_util.h" + +#include +#include + +#include "xenia/cpu/backend/a64/a64_op.h" + +// For OPCODE_PACK/OPCODE_UNPACK +#include "third_party/half/include/half.hpp" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_vector = 0; + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +struct VECTOR_CONVERT_I2F + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.UCVTF(i.dest.reg().S4(), i.src1.reg().S4()); + } else { + e.SCVTF(i.dest.reg().S4(), i.src1.reg().S4()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// ============================================================================ +struct VECTOR_CONVERT_F2I + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.FCVTZU(i.dest.reg().S4(), i.src1.reg().S4()); + } else { + e.FCVTZS(i.dest.reg().S4(), i.src1.reg().S4()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), +}; +struct LOAD_VECTOR_SHL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + 
auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsl_table)); + e.MOV(X0, reinterpret_cast(&lvsl_table[sh])); + e.LDR(i.dest, X0); + } else { + e.MOV(X0, reinterpret_cast(lvsl_table)); + e.AND(X1, i.src1.reg().toX(), 0xf); + e.LDR(i.dest, X0, X1, IndexExt::LSL, 4); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), +}; +struct LOAD_VECTOR_SHR_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsr_table)); + e.MOV(X0, reinterpret_cast(&lvsr_table[sh])); + e.LDR(i.dest, X0); + } else { + e.MOV(X0, reinterpret_cast(lvsr_table)); + e.AND(X1, i.src1.reg().toX(), 0xf); + e.LDR(i.dest, X0, X1, IndexExt::LSL, 4); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); + +// ============================================================================ +// OPCODE_VECTOR_MAX +// ============================================================================ +struct VECTOR_MAX + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.UMAX(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.UMAX(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.UMAX(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.SMAX(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.SMAX(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.SMAX(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); + +// ============================================================================ +// OPCODE_VECTOR_MIN +// 
============================================================================ +struct VECTOR_MIN + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.UMIN(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.UMIN(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.UMIN(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.SMIN(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.SMIN(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.SMIN(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +struct VECTOR_COMPARE_EQ_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMEQ(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMEQ(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMEQ(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMEQ(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, VECTOR_COMPARE_EQ_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +struct VECTOR_COMPARE_SGT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMGT(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMGT(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMGT(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMGT(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGE +// ============================================================================ +struct VECTOR_COMPARE_SGE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMGE(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMGE(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMGE(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMGE(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGT +// 
============================================================================ +struct VECTOR_COMPARE_UGT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMHI(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMHI(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMHI(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FABS(Q0.S4(), src1.S4()); + e.FABS(Q1.S4(), src2.S4()); + e.FCMGT(dest.S4(), Q0.S4(), Q1.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGE +// ============================================================================ +struct VECTOR_COMPARE_UGE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMHS(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMHS(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMHS(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FABS(Q0.S4(), src1.S4()); + e.FABS(Q1.S4(), src2.S4()); + e.FCMGE(dest.S4(), Q0.S4(), Q1.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_ADD +// ============================================================================ +struct VECTOR_ADD + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, const QReg& dest, QReg src1, QReg src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SQADD(dest.B16(), src1.B16(), src2.B16()); + } + } else { + e.ADD(dest.B16(), src1.B16(), src2.B16()); + } + break; + case INT16_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SQADD(dest.H8(), src1.H8(), src2.H8()); + } + } else { + e.ADD(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SQADD(dest.S4(), src1.S4(), src2.S4()); + } + } else { + e.ADD(dest.S4(), src1.S4(), src2.S4()); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.FADD(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); + +// ============================================================================ +// OPCODE_VECTOR_SUB +// ============================================================================ +struct VECTOR_SUB + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, const QReg& dest, QReg 
src1, QReg src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SQSUB(dest.B16(), src1.B16(), src2.B16()); + } + } else { + e.SUB(dest.B16(), src1.B16(), src2.B16()); + } + break; + case INT16_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SQSUB(dest.H8(), src1.H8(), src2.H8()); + } + } else { + e.SUB(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SQSUB(dest.S4(), src1.S4(), src2.S4()); + } + } else { + e.SUB(dest.S4(), src1.S4(), src2.S4()); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.FSUB(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorShl(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
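+  // Per the AArch64 vector ABI the uint8x16_t result comes back in V0/Q0,
+  // which is where the emitted call sequence below picks it up before
+  // copying it into the destination register.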
+ return vld1q_u8(value); +} +struct VECTOR_SHL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0] & 0xF); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0] & 0x1F); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorShr(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
+ return vld1q_u8(value); +} +struct VECTOR_SHR_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +struct VECTOR_SHA_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } 
else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); + +// ============================================================================ +// OPCODE_VECTOR_ROTATE_LEFT +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorRotateLeft(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
+ return vld1q_u8(value); +} +struct VECTOR_ROTATE_LEFT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + switch (i.instr->flags) { + case INT8_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + case INT16_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + case INT32_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + default: + assert_always(); + break; + } + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); + +// ============================================================================ +// OPCODE_VECTOR_AVERAGE +// ============================================================================ +struct VECTOR_AVERAGE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, const QReg& dest, const QReg& src1, + const QReg& src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + switch (part_type) { + case INT8_TYPE: + if (is_unsigned) { + e.URHADD(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SRHADD(dest.B16(), src1.B16(), src2.B16()); + assert_always(); + } + break; + case INT16_TYPE: + if (is_unsigned) { + e.URHADD(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SRHADD(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (is_unsigned) { + e.URHADD(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SRHADD(dest.S4(), src1.S4(), src2.S4()); + } + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); + +// ============================================================================ +// OPCODE_INSERT +// ============================================================================ +struct INSERT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Belem()[i.src2.constant() ^ 0x3], i.src3.reg()); + } +}; +struct INSERT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Helem()[i.src2.constant() ^ 0x1], i.src3.reg()); + } +}; +struct INSERT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Selem()[i.src2.constant()], i.src3.reg()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); + +// ============================================================================ +// OPCODE_EXTRACT +// ============================================================================ +// TODO(benvanik): sequence extract/splat: +// v0.i32 = extract v0.v128, 0 +// v0.v128 = splat v0.i32 +// This can be a single broadcast. 
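+// As a rough sketch (not emitted anywhere yet), such a fused extract/splat
+// could collapse into a single lane broadcast on A64, e.g.:
+//   e.DUP(dest.reg().S4(), src.reg().toQ().Selem()[0]);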
+struct EXTRACT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Belem()[VEC128_B(i.src2.constant())]); + } else { + // Fixup index + e.EOR(W0, i.src2, 0b11); + e.AND(W0, W0, 0x1F); + e.DUP(Q0.B16(), W0); + // Byte-table lookup + e.TBL(Q0.B16(), List{i.src1.reg().B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Belem()[0]); + } + } +}; +struct EXTRACT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Helem()[VEC128_W(i.src2.constant())]); + } else { + // Fixup index + e.EOR(W0, i.src2, 0b01); + e.LSL(W0, W0, 1); + + // Replicate index as byte + e.MOV(W1, 0x01'01); + e.MUL(W0, W0, W1); + + // Byte indices + e.ADD(W0, W0, 0x01'00); + e.UXTH(W0, W0); + + // Replicate byte indices + e.DUP(Q0.H8(), W0); + // Byte-table lookup + e.TBL(Q0.B16(), List{i.src1.reg().B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Helem()[0]); + } + } +}; +struct EXTRACT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + static const vec128_t extract_table_32[4] = { + vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + }; + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Selem()[VEC128_D(i.src2.constant())]); + } else { + QReg src1 = i.src1.reg(); + if (i.src1.is_constant) { + src1 = Q1; + e.LoadConstantV(src1, i.src1.constant()); + } + + e.AND(X0, i.src2.reg().toX(), 0b11); + e.LSL(X0, X0, 4); + + e.MOV(X1, reinterpret_cast(extract_table_32)); + e.LDR(Q0, X1, X0); + + // Byte-table lookup + e.TBL(Q0.B16(), List{src1.B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Selem()[0]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); + +// ============================================================================ +// OPCODE_SPLAT +// ============================================================================ +// Copy a value into all elements of a vector +struct SPLAT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOVI(i.dest.reg().B16(), i.src1.constant()); + } else { + e.DUP(i.dest.reg().B16(), i.src1); + } + } +}; +struct SPLAT_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + if ((i.src1.constant() & 0xFF'00) == 0) { + e.MOVI(i.dest.reg().H8(), i.src1.constant()); + return; + } else if ((i.src1.constant() & 0x00'FF) == 0) { + e.MOVI(i.dest.reg().H8(), i.src1.constant(), oaknut::util::LSL, 8); + return; + } + e.MOV(W0, i.src1.constant()); + e.DUP(i.dest.reg().H8(), W0); + } else { + e.DUP(i.dest.reg().H8(), i.src1); + } + } +}; +struct SPLAT_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) { + e.FMOV(i.dest.reg().S4(), fp8); + return; + } else if ((i.src1.constant() & 0xFF'FF'FF'00) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant()); + return; + } else if ((i.src1.constant() & 0xFF'FF'00'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 8); + return; + } else if ((i.src1.constant() & 0xFF'00'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 
16); + return; + } else if ((i.src1.constant() & 0x00'FF'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 24); + return; + } + e.MOV(W0, i.src1.constant()); + e.DUP(i.dest.reg().S4(), W0); + } else { + e.DUP(i.dest.reg().S4(), i.src1); + } + } +}; +struct SPLAT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) { + e.FMOV(i.dest.reg().S4(), fp8); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'FF'FF'00) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'FF'00'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 8); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'00'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 16); + return; + } else if ((i.src1.value->constant.u32 & 0x00'FF'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 24); + return; + } + e.MOV(W0, i.src1.value->constant.i32); + e.DUP(i.dest.reg().S4(), W0); + } else { + e.DUP(i.dest.reg().S4(), i.src1.reg().toQ().Selem()[0]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); + +// ============================================================================ +// OPCODE_PERMUTE +// ============================================================================ +struct PERMUTE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.instr->flags == INT32_TYPE); + // Permute words between src2 and src3. + if (i.src1.is_constant) { + // Each byte is a word-index + const uint32_t control = i.src1.constant(); + const QReg indices = Q0; + + // Word to byte index + e.MOV(W0, control * 4); + e.MOV(indices.Selem()[0], W0); + + // Widen int8 to int16 + e.ZIP1(indices.B16(), indices.B16(), indices.B16()); + // Widen int16 to int32 + e.ZIP1(indices.B16(), indices.B16(), indices.B16()); + + // Convert to byte-indices + e.MOV(W0, 0x03'02'01'00); + e.DUP(Q1.S4(), W0); + e.ADD(indices.S4(), indices.S4(), Q1.S4()); + + // Table-registers must be sequential indices + const QReg table0 = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table0, i.src2.constant()); + } else { + e.MOV(table0.B16(), i.src2.reg().B16()); + } + + const QReg table1 = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table1, i.src3.constant()); + } else { + e.MOV(table1.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table0.B16(), table1.B16()}, + indices.B16()); + } else { + // Permute by non-constant. + assert_always(); + } + } +}; +struct PERMUTE_V128 + : Sequence> { + static void EmitByInt8(A64Emitter& e, const EmitArgType& i) { + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. 
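+        // EOR of the destination with itself is the zeroing idiom here,
+        // equivalent in effect to MOVI dest, #0.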
+ e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16()); + return; + } + } + + const QReg indices = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(indices, i.src1.constant()); + } else { + e.MOV(indices.B16(), i.src1.reg().B16()); + } + + // Indices must be endian-swapped + e.MOVI(Q1.B16(), 0b11); + e.EOR(indices.B16(), indices.B16(), Q1.B16()); + + // Modulo 32 the indices + e.MOVI(Q1.B16(), 0b0001'1111); + e.AND(indices.B16(), indices.B16(), Q1.B16()); + + // Table-registers must be sequential indices + const QReg table_lo = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table_lo, i.src2.constant()); + } else { + e.MOV(table_lo.B16(), i.src2.reg().B16()); + } + + const QReg table_hi = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table_hi, i.src3.constant()); + } else { + e.MOV(table_hi.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table_lo.B16(), table_hi.B16()}, + indices.B16()); + } + + static void EmitByInt16(A64Emitter& e, const EmitArgType& i) { + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16()); + return; + } + } + + const QReg indices = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(indices, i.src1.constant()); + } else { + e.MOV(indices.B16(), i.src1.reg().B16()); + } + + // Indices must be endian-swapped + e.MOVI(Q1.H8(), 0b1); + e.EOR(indices.B16(), indices.B16(), Q1.B16()); + + // Modulo-16 the indices + e.MOVI(Q1.H8(), 0b0000'1111); + e.AND(indices.B16(), indices.B16(), Q1.B16()); + + // Convert int16 indices into int8 + e.MOVI(Q1.B16(), 0x02); + e.MUL(indices.H8(), indices.H8(), Q1.H8()); + + e.MOVI(Q1.H8(), 0x01, LSL, 8); + e.ADD(indices.H8(), indices.H8(), Q1.H8()); + + // Table-registers must be sequential indices + const QReg table_lo = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table_lo, i.src2.constant()); + } else { + e.MOV(table_lo.B16(), i.src2.reg().B16()); + } + + const QReg table_hi = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table_hi, i.src3.constant()); + } else { + e.MOV(table_hi.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table_lo.B16(), table_hi.B16()}, + indices.B16()); + } + + static void EmitByInt32(A64Emitter& e, const EmitArgType& i) { + assert_always(); + } + + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitByInt8(e, i); + break; + case INT16_TYPE: + EmitByInt16(e, i); + break; + case INT32_TYPE: + EmitByInt32(e, i); + break; + default: + assert_unhandled_case(i.instr->flags); + return; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); + +// ============================================================================ +// OPCODE_SWIZZLE +// ============================================================================ +struct SWIZZLE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto element_type = i.instr->flags; + if (element_type == INT8_TYPE) { + assert_always(); + } else if (element_type == INT16_TYPE) { + assert_always(); + } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { + // Four 2-bit word-indices packed into one 8-bit value + const uint8_t swizzle_mask = static_cast(i.src2.value); + + // Convert to byte-indices + const vec128_t indice_vec = + 
vec128i(((swizzle_mask >> 0) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 2) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 4) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 6) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00); + + const QReg indices = Q1; + e.LoadConstantV(indices, indice_vec); + + QReg table0 = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(table0, i.src1.constant()); + } else { + table0 = i.src1; + } + + e.TBL(i.dest.reg().B16(), List{table0.B16()}, indices.B16()); + } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { + assert_always(); + } else { + assert_always(); + } + }; +}; +EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, SWIZZLE); + +// ============================================================================ +// OPCODE_PACK +// ============================================================================ +struct PACK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_ULONG_4202020: + EmitULONG_4202020(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] + // are valid - max before min to pack NaN as zero (5454082B is heavily + // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0 + // constant). + e.LDR(Q0, VConstData, e.GetVConstOffset(V3333)); + e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4()); + + e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat)); + e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4()); + // Extract bytes. + // RGBA (XYZW) -> ARGB (WXYZ) + // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) + e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR)); + e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_2(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.x | src1.y), 0, 0, 0] + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.FCVTN(i.dest.reg().toD().H4(), src1.S4()); + e.MOVI(Q0.B16(), 0); + e.EXT(i.dest.reg().B16(), Q0.B16(), i.dest.reg().B16(), 4); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_4(void*, std::byte src1[16]) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 4; i++) { + b[7 - (i ^ 2)] = + half_float::detail::float2half(a[i]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_4(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0] + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.FCVTN(i.dest.reg().toD().H4(), src1.S4()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_2(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Pack + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + } + static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Pack + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + } + static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) { + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + // XYZ are 10 bits, signed and saturated. + // W is 2 bits, unsigned and saturated. + const QReg src = i.dest; + if (i.src1.is_constant) { + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate. 
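+    // FMAX against the unpacked minimum and FMIN against the unpacked maximum
+    // clamp X/Y/Z to the signed 10-bit range and W to the unsigned 2-bit
+    // range before the integer bits are masked and shifted into place.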
+ e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Remove the unneeded bits of the floats. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Shift the components up. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift)); + e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Combine the components. + e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 2, + 0x03'02'01'00 + 0x04'04'04'04 * 3, + 0x03'02'01'00 + 0x04'04'04'04 * 0, + 0x03'02'01'00 + 0x04'04'04'04 * 1)); + e.TBL(Q0.B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + + e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 1, + 0x03'02'01'00 + 0x04'04'04'04 * 0, + 0x03'02'01'00 + 0x04'04'04'04 * 3, + 0x03'02'01'00 + 0x04'04'04'04 * 2)); + e.TBL(Q0.B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + } + static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) { + // XYZ are 20 bits, signed and saturated. + // W is 4 bits, unsigned and saturated. + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Remove the unneeded bits of the floats (so excess nibbles will also be + // cleared). + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Store Y and W shifted left by 4 so vpshufb can be used with them. + e.SHL(Q0.S4(), i.dest.reg().S4(), 4); + + // Place XZ where they're supposed to be. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + // Place YW. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW)); + e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16()); + // Merge XZ and YW. + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + } + static void Emit8_IN_16(A64Emitter& e, const EmitArgType& i, uint32_t flags) { + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? 
Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + e.UQXTN(i.dest.reg().toD().B8(), src2.H8()); + e.UQXTN2(i.dest.reg().B16(), src1.H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // unsigned -> unsigned + e.XTN(i.dest.reg().toD().B8(), i.src2.reg().H8()); + e.XTN2(i.dest.reg().B16(), i.src1.reg().H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + e.SQXTUN(i.dest.reg().toD().B8(), src2.H8()); + e.SQXTUN2(i.dest.reg().B16(), src1.H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + e.SQXTN(i.dest.reg().toD().B8(), i.src2.reg().H8()); + e.SQXTN2(i.dest.reg().B16(), i.src1.reg().H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> signed + assert_always(); + } + } + } + } + // Pack 2 32-bit vectors into a 16-bit vector. + static void Emit16_IN_32(A64Emitter& e, const EmitArgType& i, + uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + e.UQXTN(i.dest.reg().toD().H4(), src2.S4()); + e.UQXTN2(i.dest.reg().H8(), src1.S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // unsigned -> unsigned + e.XTN(i.dest.reg().toD().H4(), i.src2.reg().S4()); + e.XTN2(i.dest.reg().H8(), i.src1.reg().S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + e.SQXTUN(i.dest.reg().toD().H4(), i.src2.reg().S4()); + e.SQXTUN2(i.dest.reg().H8(), i.src1.reg().S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + e.SQXTN(i.dest.reg().toD().H4(), src2.S4()); + e.SQXTN2(i.dest.reg().H8(), src1.S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> signed + assert_always(); + } + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); + +// ============================================================================ +// OPCODE_UNPACK +// ============================================================================ +struct UNPACK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_ULONG_4202020: + EmitULONG_4202020(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) { + // ARGB (WXYZ) -> RGBA (XYZW) + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.FMOV(i.dest.reg().S4(), FImm8(0, 7, 0)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // src = ZZYYXXWW + // Unpack to 000000ZZ,000000YY,000000XX,000000WW + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + // Add 1.0f to each. + e.FMOV(Q1.S4(), FImm8(0, 7, 0)); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. + } + static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) { + alignas(16) uint16_t a[4]; + alignas(16) float b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); + } + + // Constants, or something + b[2] = 0.f; + b[3] = 1.f; + + return vld1q_u8(b); + } + static void EmitFLOAT16_2(A64Emitter& e, const EmitArgType& i) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + // Move the upper 4 bytes to the lower 4 bytes, zero the rest + e.EOR(Q0.B16(), Q0.B16(), Q0.B16()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16(), 12); + + e.FCVTL(i.dest.reg().S4(), i.dest.reg().toD().H4()); + e.REV64(i.dest.reg().S4(), i.dest.reg().S4()); + + // Write 1.0 to element 3 + e.FMOV(S0, oaknut::FImm8(0, 7, 0)); + e.MOV(i.dest.reg().Selem()[3], Q0.Selem()[0]); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_4(void*, std::byte src1[16]) { + alignas(16) uint16_t a[4]; + alignas(16) float b[8]; + vst1q_u8(a, vld1q_u8(src1)); + + for (int i = 0; i < 4; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_4(A64Emitter& e, const EmitArgType& i) { + // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.src1.reg().B16(), 8); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.FCVTL(i.dest.reg().S4(), i.dest.reg().toD().H4()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_2(A64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) + // src is (xx,xx,xx,VALUE) + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + src = i.dest; + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.src1; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // If negative, make smaller than 3 - sign extend before adding. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 16); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16); + + // Add 3,3,0,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3301)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Return quiet NaNs in case of negative overflow. 
+ e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 3.0 + (VB.y>>16)*2^-22 + // (VD.w) = 3.0 + (VB.y)*2^-22 + // src is (xx,xx,VALUE,VALUE) + + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // If negative, make smaller than 3 - sign extend before adding. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 16); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16); + + // Add 3,3,3,3. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3333)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) { + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + + // Splat W. + e.DUP(i.dest.reg().S4(), src.Selem()[3]); + // Keep only the needed components. + // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Shift the components down. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift)); + e.NEG(Q1.S4(), Q1.S4()); + e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. + // W is unsigned. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22); + // Add 3,3,3,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3331)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. 
+ // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.} + } + static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) { + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW + // will have excess 4 lower bits. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // Drop the excess nibble of YW. + e.USHR(Q0.S4(), i.dest.reg().S4(), 4); + // Merge XZ and YW now both starting at offset 0. + e.LoadConstantV(Q1, vec128i(3 * 0x04'04'04'04 + 0x03'02'01'00, + 2 * 0x04'04'04'04 + 0x03'02'01'00, + 1 * 0x04'04'04'04 + 0x03'02'01'00, + 0 * 0x04'04'04'04 + 0x03'02'01'00)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16(), Q0.B16()}, + Q1.B16()); + + // Reorder as XYZW. + e.LoadConstantV(Q1, vec128i(3 * 0x04'04'04'04 + 0x03'02'01'00, + 1 * 0x04'04'04'04 + 0x03'02'01'00, + 2 * 0x04'04'04'04 + 0x03'02'01'00, + 0 * 0x04'04'04'04 + 0x03'02'01'00)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16(), Q0.B16()}, + Q1.B16()); + // Drop the excess upper nibble in XZ and sign-extend XYZ. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12); + // Add 3,3,3,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3331)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void Emit8_IN_16(A64Emitter& e, const EmitArgType& i, uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + QReg src(0); + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.SXTL2(i.dest.reg().H8(), i.dest.reg().B16()); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.SXTL(i.dest.reg().H8(), i.dest.reg().toD().B8()); + } + } + } + } + static void Emit16_IN_32(A64Emitter& e, const EmitArgType& i, + uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + QReg src(0); + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. 
+ if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.SXTL2(i.dest.reg().S4(), src.H8()); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.SXTL(i.dest.reg().S4(), src.toD().H4()); + } + } + } + e.REV64(i.dest.reg().S4(), i.dest.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_sequences.cc b/src/xenia/cpu/backend/a64/a64_sequences.cc new file mode 100644 index 000000000..3eb60510e --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_sequences.cc @@ -0,0 +1,2788 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +// A note about vectors: +// Xenia represents vectors as xyzw pairs, with indices 0123. +// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. +// This makes things somewhat confusing. It'd be nice to just shuffle the +// registers around on load/store, however certain operations require that +// data be in the right offset. +// Basically, this identity must hold: +// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} +// All indices and operations must respect that. +// +// Memory (big endian): +// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) +// load into xmm register: +// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/logging.h" +#include "xenia/base/string.h" +#include "xenia/base/threading.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_op.h" +#include "xenia/cpu/backend/a64/a64_tracers.h" +#include "xenia/cpu/backend/a64/a64_util.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace oaknut; + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; + +using xe::cpu::hir::Instr; + +typedef bool (*SequenceSelectFn)(A64Emitter&, const Instr*); +std::unordered_map sequence_table; + +// ============================================================================ +// OPCODE_COMMENT +// ============================================================================ +struct COMMENT : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (IsTracingInstr()) { + auto str = reinterpret_cast(i.src1.value); + // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. 
+ auto str_copy = xe_strdup(str); + e.MOV(e.GetNativeParam(0), reinterpret_cast(str_copy)); + e.CallNative(reinterpret_cast(TraceString)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMMENT, COMMENT); + +// ============================================================================ +// OPCODE_NOP +// ============================================================================ +struct NOP : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { e.NOP(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_NOP, NOP); + +// ============================================================================ +// OPCODE_SOURCE_OFFSET +// ============================================================================ +struct SOURCE_OFFSET + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MarkSourceOffset(i.instr); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET, SOURCE_OFFSET); + +// ============================================================================ +// OPCODE_ASSIGN +// ============================================================================ +struct ASSIGN_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ASSIGN_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct ASSIGN_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1); + } +}; +struct ASSIGN_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1); + } +}; +struct ASSIGN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct ASSIGN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct ASSIGN_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ASSIGN, ASSIGN_I8, ASSIGN_I16, ASSIGN_I32, + ASSIGN_I64, ASSIGN_F32, ASSIGN_F64, ASSIGN_V128); + +// ============================================================================ +// OPCODE_CAST +// ============================================================================ +struct CAST_I32_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_I64_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_F32_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_F64_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CAST, CAST_I32_F32, CAST_I64_F64, CAST_F32_I32, + CAST_F64_I64); + +// ============================================================================ +// OPCODE_ZERO_EXTEND +// ============================================================================ +struct ZERO_EXTEND_I16_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I32_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I64_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest.reg().toW(), i.src1); + } +}; +struct ZERO_EXTEND_I32_I16 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I64_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest.reg().toW(), i.src1); + } +}; +struct ZERO_EXTEND_I64_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest.reg().toW(), i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ZERO_EXTEND, ZERO_EXTEND_I16_I8, ZERO_EXTEND_I32_I8, + ZERO_EXTEND_I64_I8, ZERO_EXTEND_I32_I16, + ZERO_EXTEND_I64_I16, ZERO_EXTEND_I64_I32); + +// ============================================================================ +// OPCODE_SIGN_EXTEND +// ============================================================================ +struct SIGN_EXTEND_I16_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I32_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I32_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTH(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTH(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTW(i.dest, i.src1.reg()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SIGN_EXTEND, SIGN_EXTEND_I16_I8, SIGN_EXTEND_I32_I8, + SIGN_EXTEND_I64_I8, SIGN_EXTEND_I32_I16, + SIGN_EXTEND_I64_I16, SIGN_EXTEND_I64_I32); + +// ============================================================================ +// OPCODE_TRUNCATE +// ============================================================================ +struct TRUNCATE_I8_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct TRUNCATE_I8_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct TRUNCATE_I8_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1.reg().toW()); + } +}; +struct TRUNCATE_I16_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct TRUNCATE_I16_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1.reg().toW()); + } +}; +struct TRUNCATE_I32_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1.reg().toW()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRUNCATE, TRUNCATE_I8_I16, TRUNCATE_I8_I32, + TRUNCATE_I8_I64, TRUNCATE_I16_I32, TRUNCATE_I16_I64, + TRUNCATE_I32_I64); + +// ============================================================================ +// OPCODE_CONVERT +// ============================================================================ +struct CONVERT_I32_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) 
+ if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toS()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toS()); + } + } +}; +struct CONVERT_I32_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Intel returns 0x80000000 if the double value does not fit within an int32 + // ARM64 and PPC saturates the value instead + if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toD()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toD()); + } + } +}; +struct CONVERT_I64_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toD()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toD()); + } + } +}; +struct CONVERT_F32_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SCVTF(i.dest.reg().toS(), i.src1); + } +}; +struct CONVERT_F32_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCVT(i.dest.reg().toS(), i.src1.reg().toD()); + } +}; +struct CONVERT_F64_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SCVTF(i.dest.reg().toD(), i.src1); + } +}; +struct CONVERT_F64_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.vcvtss2sd(i.dest, i.src1); + e.FCVT(i.dest.reg().toD(), i.src1.reg().toS()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CONVERT, CONVERT_I32_F32, CONVERT_I32_F64, + CONVERT_I64_F64, CONVERT_F32_I32, CONVERT_F32_F64, + CONVERT_F64_I64, CONVERT_F64_F32); + +// ============================================================================ +// OPCODE_ROUND +// ============================================================================ +struct ROUND_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest.reg().toS(), i.src1.reg().toS()); + break; + } + } +}; +struct ROUND_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest, i.src1); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest, i.src1); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest, i.src1); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest, i.src1); + break; + } + } +}; +struct ROUND_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest.reg().S4(), i.src1.reg().S4()); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128); + +// ============================================================================ +// OPCODE_LOAD_CLOCK +// ============================================================================ +struct LOAD_CLOCK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // When scaling is disabled and the raw clock source is selected, the code + // in the Clock class is actually 
just forwarding tick counts after one + // simple multiply and division. In that case we rather bake the scaling in + // here to cut extra function calls with CPU cache misses and stack frame + // overhead. + if (cvars::clock_no_scaling && cvars::clock_source_raw) { + auto ratio = Clock::guest_tick_ratio(); + // The 360 CPU is an in-order CPU, ARM64 usually isn't. Since it's + // resolution however is much higher than the 360's mftb instruction this + // can safely be ignored. + + // Read clock cycle count + e.MRS(i.dest, SystemReg::CNTVCT_EL0); + // Apply tick frequency scaling. + e.MOV(X0, ratio.first); + e.MUL(i.dest, i.dest, X0); + e.MOV(X0, ratio.second); + e.UDIV(i.dest, i.dest, X0); + } else { + e.CallNative(LoadClock); + e.MOV(i.dest, X0); + } + } + static uint64_t LoadClock(void* raw_context) { + return Clock::QueryGuestTickCount(); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK, LOAD_CLOCK); + +// ============================================================================ +// OPCODE_CONTEXT_BARRIER +// ============================================================================ +struct CONTEXT_BARRIER + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) {} +}; +EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); + +// ============================================================================ +// OPCODE_MAX +// ============================================================================ +struct MAX_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMAX(dest, src1, src2); + }); + } +}; +struct MAX_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMAX(dest, src1, src2); + }); + } +}; +struct MAX_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMAX(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128); + +// ============================================================================ +// OPCODE_MIN +// ============================================================================ +struct MIN_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, Cond::LO); + }); + } +}; +struct MIN_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, Cond::LO); + }); + } +}; +struct MIN_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, 
Cond::LO); + }); + } +}; +struct MIN_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, XReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, XReg dest_src, int64_t constant) { + e.MOV(X0, constant); + e.CMP(dest_src, X0); + e.CSEL(dest_src, dest_src, X0, Cond::LO); + }); + } +}; +struct MIN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMIN(dest, src1, src2); + }); + } +}; +struct MIN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMIN(dest, src1, src2); + }); + } +}; +struct MIN_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMIN(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32, + MIN_F64, MIN_V128); + +// ============================================================================ +// OPCODE_SELECT +// ============================================================================ +// dest = src1 ? src2 : src3 +// TODO(benvanik): match compare + select sequences, as often it's something +// like SELECT(VECTOR_COMPARE_SGE(a, b), a, b) +struct SELECT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + XReg src2(0); + if (i.src2.is_constant) { + src2 = X0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + SReg src2 = i.src2.is_constant ? S2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } + + SReg src3 = i.src3.is_constant ? S3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.FCSEL(i.dest, src2, src3, Cond::NE); + } +}; +struct SELECT_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + const DReg src2 = i.src2.is_constant ? D2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } + + const DReg src3 = i.src3.is_constant ? 
D3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.FCSEL(i.dest, src2, src3, Cond::NE); + } +}; +struct SELECT_V128_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + const QReg src2 = i.src2.is_constant ? Q2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + const QReg src3 = i.src3.is_constant ? Q3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3, i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.CSETM(W0, Cond::NE); + e.DUP(i.dest.reg().S4(), W0); + e.BSL(i.dest.reg().B16(), src2.B16(), src3.B16()); + } +}; +struct SELECT_V128_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + const QReg src1 = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } else { + e.MOV(src1.B16(), i.src1.reg().B16()); + } + + const QReg src2 = i.src2.is_constant ? Q2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + const QReg src3 = i.src3.is_constant ? Q3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3, i.src3.constant()); + } + + // src1 ? src2 : src3; + e.BSL(src1.B16(), src3.B16(), src2.B16()); + e.MOV(i.dest.reg().B16(), src1.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, + SELECT_I64, SELECT_F32, SELECT_F64, SELECT_V128_I8, + SELECT_V128_V128); + +// ============================================================================ +// OPCODE_IS_TRUE +// ============================================================================ +struct IS_TRUE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UMAXV(Q0.toS(), i.src1.reg().S4()); + e.MOV(W0, Q0.Selem()[0]); + e.CMP(W0, 0); + e.CSET(i.dest, Cond::NE); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, + IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); + +// ============================================================================ +// OPCODE_IS_FALSE +// ============================================================================ +struct IS_FALSE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + 
e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UMAXV(Q0.toS(), i.src1.reg().S4()); + e.MOV(W0, Q0.Selem()[0]); + e.CMP(W0, 0); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, + IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); + +// ============================================================================ +// OPCODE_IS_NAN +// ============================================================================ +struct IS_NAN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src1); + e.CSET(i.dest, Cond::VS); + } +}; + +struct IS_NAN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src1); + e.CSET(i.dest, Cond::VS); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_NAN, IS_NAN_F32, IS_NAN_F64); + +// ============================================================================ +// OPCODE_COMPARE_EQ +// ============================================================================ +struct COMPARE_EQ_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, XReg src1, XReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, XReg src1, int32_t constant) { + e.MOV(X1, constant); + e.CMP(src1, X1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, I8Op dest, const SReg& src1, const SReg& src2) { + e.FCMP(src1, src2); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, I8Op dest, const DReg& src1, const DReg& src2) { + e.FCMP(src1, src2); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EQ, COMPARE_EQ_I8, 
COMPARE_EQ_I16, + COMPARE_EQ_I32, COMPARE_EQ_I64, COMPARE_EQ_F32, + COMPARE_EQ_F64); + +// ============================================================================ +// OPCODE_COMPARE_NE +// ============================================================================ +struct COMPARE_NE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, XReg src1, XReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, XReg src1, int32_t constant) { + e.MOV(X1, constant); + e.CMP(src1, X1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src2); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src2); + e.CSET(i.dest, Cond::NE); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMPARE_NE, COMPARE_NE_I8, COMPARE_NE_I16, + COMPARE_NE_I32, COMPARE_NE_I64, COMPARE_NE_F32, + COMPARE_NE_F64); + +// ============================================================================ +// OPCODE_COMPARE_* +// ============================================================================ +#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, type, \ + reg_type) \ + struct COMPARE_##op##_##type \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + EmitAssociativeCompareOp( \ + e, i, \ + [](A64Emitter& e, WReg dest, const reg_type& src1, \ + const reg_type& src2, bool inverse) { \ + e.CMP(src1, src2); \ + if (!inverse) { \ + e.CSET(dest, cond); \ + } else { \ + e.CSET(dest, inverse_cond); \ + } \ + }, \ + [](A64Emitter& e, WReg dest, const reg_type& src1, int32_t constant, \ + bool inverse) { \ + e.MOV(reg_type(1), constant); \ + e.CMP(src1, reg_type(1)); \ + if (!inverse) { \ + e.CSET(dest, cond); \ + } else { \ + e.CSET(dest, inverse_cond); \ + } \ + }); \ + } \ + }; +#define EMITTER_ASSOCIATIVE_COMPARE_XX(op, cond, inverse_cond) \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I8Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I16Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I32Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I64Op, XReg); \ + EMITTER_OPCODE_TABLE(OPCODE_COMPARE_##op, COMPARE_##op##_I8Op, \ + COMPARE_##op##_I16Op, COMPARE_##op##_I32Op, \ + COMPARE_##op##_I64Op); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, Cond::LT, Cond::GT); // setl, setg 
+EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, Cond::LE, Cond::GE); // setle, setge +EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, Cond::GT, Cond::LT); // setg, setl +EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, Cond::GE, Cond::LE); // setge, setle +EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, Cond::LO, Cond::HI); // setb, seta +EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, Cond::LS, Cond::HS); // setbe, setae +EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, Cond::HS, Cond::LS); // setae, setbe +EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, Cond::HI, Cond::LO); // seta, setb + +// https://web.archive.org/web/20171129015931/https://x86.renejeschke.de/html/file_module_x86_id_288.html +// Original link: https://x86.renejeschke.de/html/file_module_x86_id_288.html +#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, cond) \ + struct COMPARE_##op##_F32 \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + e.FCMP(i.src1, i.src2); \ + e.CSET(i.dest, cond); \ + } \ + }; \ + struct COMPARE_##op##_F64 \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + if (i.src1.is_constant) { \ + e.LoadConstantV(Q0, i.src1.constant()); \ + e.FCMP(D0, i.src2); \ + } else if (i.src2.is_constant) { \ + e.LoadConstantV(Q0, i.src2.constant()); \ + e.FCMP(i.src1, D0); \ + } else { \ + e.FCMP(i.src1, i.src2); \ + } \ + e.CSET(i.dest, cond); \ + } \ + }; \ + EMITTER_OPCODE_TABLE(OPCODE_COMPARE_##op##_FLT, COMPARE_##op##_F32, \ + COMPARE_##op##_F64); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLT, Cond::LT); // setb +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLE, Cond::LE); // setbe +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGT, Cond::GT); // seta +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGE, Cond::GE); // setae +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULT, Cond::LO); // setb +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULE, Cond::LS); // setbe +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGT, Cond::HI); // seta +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGE, Cond::HS); // setae + +// ============================================================================ +// OPCODE_DID_SATURATE +// ============================================================================ +struct DID_SATURATE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Bit 27 in the FPSR is the QC bit + e.MRS(X0, SystemReg::FPSR); + e.UBFX(i.dest, W0, 27, 1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, DID_SATURATE); + +// ============================================================================ +// OPCODE_ADD +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
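For context on the integer ADD lowerings that follow: the sequences hand EmitCommutativeBinaryOp one callback for the register/register form and one for the register/constant form, and because the operation is commutative a constant on either side can presumably be routed through the same constant path after being materialized into a scratch register. Below is a minimal standalone analogue of that dispatch with made-up types and names (it is not the emitter's real helper), just to make the shape explicit.

// Illustrative sketch only: not part of the patch and not the real
// EmitCommutativeBinaryOp. It only shows the dispatch shape the ADD/MIN/MUL
// sequences rely on: a constant on either side of a commutative op can share
// one "materialize into scratch, then operate" path.
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>

struct Operand {
  std::optional<int32_t> constant;  // Set when the HIR value is a constant.
  uint32_t reg = 0;                 // Otherwise an allocated register index.
};

void EmitCommutativeBinaryOpSketch(
    const Operand& src1, const Operand& src2,
    const std::function<void(uint32_t, uint32_t)>& reg_reg,
    const std::function<void(uint32_t, int32_t)>& reg_const) {
  if (src1.constant) {
    // Commutativity: swap so the constant always lands in the second slot.
    reg_const(src2.reg, *src1.constant);
  } else if (src2.constant) {
    reg_const(src1.reg, *src2.constant);
  } else {
    reg_reg(src1.reg, src2.reg);
  }
}

int main() {
  Operand a{std::nullopt, 5};
  Operand b{42, 0};
  EmitCommutativeBinaryOpSketch(
      a, b,
      [](uint32_t d, uint32_t s) {
        std::cout << "ADD W" << d << ", W" << d << ", W" << s << "\n";
      },
      [](uint32_t d, int32_t c) {
        // Mirrors the pattern in EmitAddXX: MOV scratch, #c; ADD d, d, scratch.
        std::cout << "MOV W1, #" << c << "\n"
                  << "ADD W" << d << ", W" << d << ", W1\n";
      });
  return 0;
}

The same pattern shows up in the MIN, MUL, and compare sequences above, which is why nearly every constant case starts by moving the immediate into W0/X0 or W1/X1.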
+template +void EmitAddXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.ADD(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ADD(dest_src, dest_src, REG(1)); + }); +} +struct ADD_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FADD(dest, src1, src2); + }); + } +}; +struct ADD_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FADD(dest, src1, src2); + }); + } +}; +struct ADD_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FADD(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ADD, ADD_I8, ADD_I16, ADD_I32, ADD_I64, ADD_F32, + ADD_F64, ADD_V128); + +// ============================================================================ +// OPCODE_ADD_CARRY +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAddCarryXX(A64Emitter& e, const ARGS& i) { + // TODO(benvanik): faster setting? we could probably do some fun math tricks + // here to get the carry flag set. + if (i.src3.is_constant) { + e.MOV(W0, WZR); + if (i.src3.constant()) { + // Set carry + // This is implicitly "SUBS 0 - 0" + e.CMP(W0, 0); + } else { + // Clear carry + e.CMN(W0, 0); + } + } else { + // If src3 is non-zero, set the carry flag + e.CMP(i.src3.reg().toW(), 0); + e.CSET(X0, Cond::NE); + + e.MRS(X1, SystemReg::NZCV); + // Assign carry bit + e.BFI(X1, X0, 29, 1); + e.MSR(SystemReg::NZCV, X1); + } + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, const REG& dest_src, const REG& src) { + e.ADC(dest_src, dest_src, src); + }, + [](A64Emitter& e, const REG& dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ADC(dest_src, dest_src, REG(1)); + }); +} +struct ADD_CARRY_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, + ADD_CARRY_I32, ADD_CARRY_I64); + +// ============================================================================ +// OPCODE_SUB +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
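One note on the ADD_CARRY sequences just above: AArch64 keeps the carry flag in bit 29 of the NZCV register, so when src3 is not a constant the emitted code normalizes it to 0/1 with CSET, splices that bit into NZCV with BFI, and writes it back with MSR before ADC consumes it. A small standalone sketch of the bit arithmetic involved (plain C++, not emitter code):

// Illustrative sketch only (not part of the patch): how ADD_CARRY primes the
// AArch64 carry flag. NZCV packs N=bit31, Z=bit30, C=bit29, V=bit28;
// BFI(X1, X0, 29, 1) inserts the low bit of X0 at bit position 29.
#include <cassert>
#include <cstdint>

constexpr uint64_t kNzcvCarryBit = 29;

// Equivalent of: CSET X0, NE  (after CMP src3, #0)
constexpr uint64_t cset_ne(uint64_t src3) { return src3 != 0 ? 1u : 0u; }

// Equivalent of: BFI X1, X0, #29, #1  (insert one bit at position 29)
constexpr uint64_t bfi_carry(uint64_t nzcv, uint64_t bit) {
  return (nzcv & ~(1ull << kNzcvCarryBit)) | ((bit & 1ull) << kNzcvCarryBit);
}

// What ADC then computes once the flag is in place.
constexpr uint64_t adc(uint64_t a, uint64_t b, uint64_t nzcv) {
  return a + b + ((nzcv >> kNzcvCarryBit) & 1ull);
}

int main() {
  uint64_t nzcv = 0;
  nzcv = bfi_carry(nzcv, cset_ne(/*src3=*/7));  // Non-zero src3 sets carry.
  assert(adc(1, 2, nzcv) == 4);
  nzcv = bfi_carry(nzcv, cset_ne(/*src3=*/0));  // Zero src3 clears carry.
  assert(adc(1, 2, nzcv) == 3);
  return 0;
}

The constant cases in the sequence take the cheaper route: CMP against zero (a subtract with no borrow, which sets C) to set carry, and CMN against zero (an add with no overflow of the carry, which clears C) to clear it.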
+template +void EmitSubXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.SUB(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.SUB(dest_src, dest_src, REG(1)); + }); +} +struct SUB_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FSUB(dest, src1, src2); + }); + } +}; +struct SUB_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FSUB(dest, src1, src2); + }); + } +}; +struct SUB_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FSUB(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, + SUB_F64, SUB_V128); + +// ============================================================================ +// OPCODE_MUL +// ============================================================================ +// Sign doesn't matter here, as we don't use the high bits. +// We exploit mulx here to avoid creating too much register pressure. 
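Supporting the remark above that sign does not matter when only the low bits are kept: multiplication wraps modulo 2^N, so the truncated product is bit-identical for the signed and unsigned interpretations of the operands, and a single MUL serves both; only MUL_HI (further below) has to choose between the signed and unsigned widening forms. A quick standalone check:

// Illustrative check (not part of the patch): the low N bits of a product do
// not depend on whether the operands are treated as signed or unsigned,
// because multiplication wraps modulo 2^N. Done in unsigned arithmetic to
// avoid signed-overflow UB.
#include <cassert>
#include <cstdint>

int main() {
  const int8_t a = -3, b = 7;  // Arbitrary test values.
  // Signed interpretation, truncated to 8 bits.
  const uint8_t low_signed =
      static_cast<uint8_t>(static_cast<int32_t>(a) * static_cast<int32_t>(b));
  // Unsigned interpretation of the same bit patterns, truncated to 8 bits.
  const uint8_t low_unsigned = static_cast<uint8_t>(
      static_cast<uint32_t>(static_cast<uint8_t>(a)) *
      static_cast<uint32_t>(static_cast<uint8_t>(b)));
  assert(low_signed == low_unsigned);  // Both are (-21) mod 256 == 235.
  return 0;
}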
+struct MUL_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.MUL(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.MUL(i.dest, i.src1, X0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMUL(dest, src1, src2); + }); + } +}; +struct MUL_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMUL(dest, src1, src2); + }); + } +}; +struct MUL_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMUL(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL, MUL_I8, MUL_I16, MUL_I32, MUL_I64, MUL_F32, + MUL_F64, MUL_V128); + +// ============================================================================ +// OPCODE_MUL_HI +// ============================================================================ +struct MUL_HI_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.UBFX(i.dest, i.dest, 8, 8); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.SBFX(i.dest, i.dest, 8, 8); + } + } +}; +struct MUL_HI_I16 + : Sequence> { + static void Emit(A64Emitter& 
e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.UBFX(i.dest, i.dest, 16, 16); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.SBFX(i.dest, i.dest, 16, 16); + } + } +}; +struct MUL_HI_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UMULL(X0, W0, i.src2); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UMULL(X0, W0, i.src1); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else { + e.UMULL(X0, i.src1, i.src2); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SMULL(X0, W0, i.src2); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SMULL(X0, W0, i.src1); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else { + e.SMULL(X0, i.src1, i.src2); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } + } + } +}; +struct MUL_HI_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.UMULH(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.UMULH(i.dest, i.src1, X0); + } else { + e.UMULH(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.SMULH(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.SMULH(i.dest, i.src1, X0); + } else { + e.SMULH(i.dest, i.src1, i.src2); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_HI, MUL_HI_I8, MUL_HI_I16, MUL_HI_I32, + MUL_HI_I64); + +// ============================================================================ +// OPCODE_DIV +// ============================================================================ +struct DIV_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + e.UXTB(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + 
assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + e.SXTB(i.dest, i.dest); + } + } +}; +struct DIV_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + e.UXTH(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + e.SXTH(i.dest, i.dest); + } + } +}; +struct DIV_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + } + } +}; +struct DIV_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.UDIV(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.UDIV(i.dest, i.src1, X0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.SDIV(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.SDIV(i.dest, i.src1, X0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + } + } +}; +struct DIV_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FDIV(dest, src1, src2); + }); + } +}; +struct DIV_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FDIV(dest, src1, src2); + }); + } +}; +struct DIV_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FDIV(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, + DIV_F64, DIV_V128); + +// ============================================================================ +// OPCODE_MUL_ADD +// 
============================================================================ +// d = 1 * 2 + 3 +// $0 = $1x$0 + $2 +struct MUL_ADD_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + SReg src3 = S3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + src3 = i.src3.reg(); + } + + SReg src2 = S2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + SReg src1 = S1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); + } +}; +struct MUL_ADD_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + DReg src3 = D3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + src3 = i.src3.reg(); + } + + DReg src2 = D2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + DReg src1 = D1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); + } +}; +struct MUL_ADD_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + const QReg dest = i.dest.reg(); + if (i.src3.is_constant) { + e.LoadConstantV(dest.toQ(), i.src3.constant()); + } else { + // If i.dest != i.src3, move the addition-term into dest for FMLA + if (i.dest != i.src3) { + e.MOV(dest.B16(), i.src3.reg().B16()); + } + } + + QReg src2 = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + QReg src1 = Q1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMLA(dest.S4(), src1.S4(), src2.S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); + +// ============================================================================ +// OPCODE_MUL_SUB +// ============================================================================ +// d = 1 * 2 - 3 +// $0 = $2x$0 - $3 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. +// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +// Forms: +// - 132 -> $1 = $1 * $3 - $2 +// - 213 -> $1 = $2 * $1 - $3 +// - 231 -> $1 = $2 * $3 - $1 +struct MUL_SUB_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + SReg src3(1); + if (i.src3.is_constant) { + src3 = S1; + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3.reg(); + if (i.dest.reg().index() == i.src3.reg().index()) { + e.FMOV(S1, i.src3); + src3 = S1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMUL(dest, src1, src2); // $0 = $1 * $2 + }); + + e.FSUB(i.dest, i.dest, src3); // $0 = $1 - $2 + } +}; +struct MUL_SUB_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + DReg src3(1); + if (i.src3.is_constant) { + src3 = D1; + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
+ src3 = i.src3.reg(); + if (i.dest.reg().index() == i.src3.reg().index()) { + e.FMOV(D1, i.src3); + src3 = D1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMUL(dest, src1, src2); // $0 = $1 * $2 + }); + + e.FSUB(i.dest, i.dest, src3); // $0 = $1 + $2 + } +}; +struct MUL_SUB_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + QReg src3(1); + if (i.src3.is_constant) { + src3 = Q1; + e.LoadConstantV(src3, i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.MOV(Q1.B16(), i.src3.reg().B16()); + src3 = Q1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMUL(dest.S4(), src1.S4(), src2.S4()); // $0 = $1 * $2 + }); + + e.FSUB(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128); + +// ============================================================================ +// OPCODE_NEG +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +template +void EmitNegXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, [](A64Emitter& e, REG dest_src) { e.NEG(dest_src, dest_src); }); +} +struct NEG_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FNEG(i.dest, i.src1); + } +}; +struct NEG_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FNEG(i.dest, i.src1); + } +}; +struct NEG_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + e.FNEG(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, + NEG_F64, NEG_V128); + +// ============================================================================ +// OPCODE_ABS +// ============================================================================ +struct ABS_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest, i.src1); + } +}; +struct ABS_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest, i.src1); + } +}; +struct ABS_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); + +// ============================================================================ +// OPCODE_SQRT +// ============================================================================ +struct SQRT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest, i.src1); + } +}; +struct SQRT_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest, i.src1); + } +}; +struct SQRT_V128 : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SQRT, SQRT_F32, SQRT_F64, SQRT_V128); + +// ============================================================================ +// OPCODE_RSQRT +// ============================================================================ +// Altivec guarantees an error of < 1/4096 for vrsqrtefp +struct RSQRT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest, i.src1); + } +}; +struct RSQRT_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest, i.src1); + } +}; +struct RSQRT_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RSQRT, RSQRT_F32, RSQRT_F64, RSQRT_V128); + +// ============================================================================ +// OPCODE_RECIP +// ============================================================================ +// Altivec guarantees an error of < 1/4096 for vrefp +struct RECIP_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest, i.src1); + } +}; +struct RECIP_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest, i.src1); + } +}; +struct RECIP_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128); + +// ============================================================================ +// OPCODE_POW2 +// ============================================================================ +// TODO(benvanik): use approx here: +// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +struct POW2_F32 : Sequence> { + static float32x4_t EmulatePow2(void*, std::byte src[16]) { + float src_value; + vst1q_lane_f32(&src_value, vld1q_u8(src), 0); + const float result = std::exp2(src_value); + return vld1q_lane_f32(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.FMOV(i.dest, S0); + } +}; +struct POW2_F64 : Sequence> { + static float64x2_t EmulatePow2(void*, std::byte src[16]) { + double src_value; + vst1q_lane_f64(&src_value, vld1q_u8(src), 0); + const double result = std::exp2(src_value); + return vld1q_lane_f64(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.FMOV(i.dest, D0); + } +}; +struct POW2_V128 : Sequence> { + static float32x4_t EmulatePow2(void*, std::byte src[16]) { + alignas(16) float values[4]; + vst1q_f32(values, vld1q_u8(src)); + for (size_t i = 0; i < 4; ++i) { + values[i] = std::exp2(values[i]); + } + return vld1q_f32(values); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128); + +// ============================================================================ +// OPCODE_LOG2 +// 
============================================================================ +// TODO(benvanik): use approx here: +// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +// TODO(benvanik): this emulated fn destroys all xmm registers! don't do it! +struct LOG2_F32 : Sequence> { + static float32x4_t EmulateLog2(void*, std::byte src[16]) { + float src_value; + vst1q_lane_f32(&src_value, vld1q_u8(src), 0); + float result = std::log2(src_value); + return vld1q_lane_f32(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.FMOV(i.dest, S0); + } +}; +struct LOG2_F64 : Sequence> { + static float64x2_t EmulateLog2(void*, std::byte src[16]) { + double src_value; + vst1q_lane_f64(&src_value, vld1q_u8(src), 0); + double result = std::log2(src_value); + return vld1q_lane_f64(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.FMOV(i.dest, D0); + } +}; +struct LOG2_V128 : Sequence> { + static float32x4_t EmulateLog2(void*, std::byte src[16]) { + alignas(16) float values[4]; + vst1q_f32(values, vld1q_u8(src)); + for (size_t i = 0; i < 4; ++i) { + values[i] = std::log2(values[i]); + } + return vld1q_f32(values); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); + +// ============================================================================ +// OPCODE_DOT_PRODUCT_3 +// ============================================================================ +struct DOT_PRODUCT_3_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, QReg src1, QReg src2) { + e.FMUL(dest.toQ().S4(), src1.S4(), src2.S4()); + e.MOV(dest.toQ().Selem()[3], WZR); + e.FADDP(dest.toQ().S4(), dest.toQ().S4(), dest.toQ().S4()); + e.FADDP(dest.toS(), dest.toD().S2()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); + +// ============================================================================ +// OPCODE_DOT_PRODUCT_4 +// ============================================================================ +struct DOT_PRODUCT_4_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, QReg src1, QReg src2) { + e.FMUL(dest.toQ().S4(), src1.S4(), src2.S4()); + e.FADDP(dest.toQ().S4(), dest.toQ().S4(), dest.toQ().S4()); + e.FADDP(dest.toS(), dest.toD().S2()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128); + +// 
============================================================================ +// OPCODE_AND +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAndXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.AND(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.AND(dest_src, dest_src, REG(1)); + }); +} +struct AND_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.AND(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); + +// ============================================================================ +// OPCODE_AND_NOT +// ============================================================================ +template +void EmitAndNotXX(A64Emitter& e, const ARGS& i) { + if (i.src1.is_constant) { + // src1 constant. + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + e.BIC(i.dest, temp, i.src2); + } else if (i.src2.is_constant) { + // src2 constant. + if (i.dest.reg().index() == i.src1.reg().index()) { + auto temp = GetTempReg(e); + e.MOV(temp, ~i.src2.constant()); + e.AND(i.dest, i.dest, temp); + } else { + e.MOV(i.dest, i.src1); + auto temp = GetTempReg(e); + e.MOV(temp, ~i.src2.constant()); + e.AND(i.dest, i.dest, temp); + } + } else { + // neither are constant + e.BIC(i.dest, i.src1, i.src2); + } +} +struct AND_NOT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.BIC(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32, + AND_NOT_I64, AND_NOT_V128); + +// ============================================================================ +// OPCODE_OR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
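+// Note: the immediate path below materializes the constant into scratch
+// register 1 (W1/X1) with a MOV; ORR's bitmask-immediate encoding is not used
+// by this helper.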
+template +void EmitOrXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.ORR(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ORR(dest_src, dest_src, REG(1)); + }); +} +struct OR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.ORR(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128); + +// ============================================================================ +// OPCODE_XOR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitXorXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.EOR(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.EOR(dest_src, dest_src, REG(1)); + }); +} +struct XOR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.EOR(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128); + +// ============================================================================ +// OPCODE_NOT +// ============================================================================ +// TODO(benvanik): put dest/src1 together. 
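+// MVN is the AArch64 alias of ORN with the zero register as the first source;
+// it inverts the full 32- or 64-bit register.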
+template +void EmitNotXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, [](A64Emitter& e, REG dest_src) { e.MVN(dest_src, dest_src); }); +} +struct NOT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.NOT(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128); + +// ============================================================================ +// OPCODE_SHL +// ============================================================================ +// TODO(benvanik): optimize common shifts. +template +void EmitShlXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, WReg src) { + e.LSL(dest_src, dest_src, REG(src.index())); + }, + [](A64Emitter& e, REG dest_src, int8_t constant) { + e.LSL(dest_src, dest_src, constant); + }); +} +struct SHL_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.MOV(e.GetNativeParam(1), i.src2.constant()); + } else { + e.MOV(e.GetNativeParam(1), i.src2.reg().toX()); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateShlV128)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static float32x4_t EmulateShlV128(void*, std::byte src1[16], uint8_t src2) { + // Almost all instances are shamt = 1, but non-constant. 
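+    // The i ^ 0x3 indexing addresses bytes in guest (big-endian) word order,
+    // as vec128_t keeps each 32-bit word in host little-endian layout;
+    // iterating upward means u8[(i + 1) ^ 0x3] is still unmodified when read.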
+ // shamt is [0,7] + uint8_t shamt = src2 & 0x7; + alignas(16) vec128_t value; + vst1q_f32(reinterpret_cast(&value), vld1q_u8(src1)); + for (int i = 0; i < 15; ++i) { + value.u8[i ^ 0x3] = (value.u8[i ^ 0x3] << shamt) | + (value.u8[(i + 1) ^ 0x3] >> (8 - shamt)); + } + value.u8[15 ^ 0x3] = value.u8[15 ^ 0x3] << shamt; + return vld1q_f32(reinterpret_cast(&value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHL, SHL_I8, SHL_I16, SHL_I32, SHL_I64, SHL_V128); + +// ============================================================================ +// OPCODE_SHR +// ============================================================================ +struct SHR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src.toX()); + }, + [](A64Emitter& e, XReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.MOV(e.GetNativeParam(1), i.src2.constant()); + } else { + e.MOV(e.GetNativeParam(1), i.src2.reg().toX()); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static float32x4_t EmulateShrV128(void*, std::byte src1[16], uint8_t src2) { + // Almost all instances are shamt = 1, but non-constant. 
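+    // Same byte-order trick as EmulateShlV128; walking from the high byte down
+    // means each source byte u8[(i - 1) ^ 0x3] is still unmodified when read.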
+ // shamt is [0,7] + uint8_t shamt = src2 & 0x7; + alignas(16) vec128_t value; + vst1q_f32(reinterpret_cast(&value), vld1q_u8(src1)); + for (int i = 15; i > 0; --i) { + value.u8[i ^ 0x3] = (value.u8[i ^ 0x3] >> shamt) | + (value.u8[(i - 1) ^ 0x3] << (8 - shamt)); + } + value.u8[0 ^ 0x3] = value.u8[0 ^ 0x3] >> shamt; + return vld1q_f32(reinterpret_cast(&value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHR, SHR_I8, SHR_I16, SHR_I32, SHR_I64, SHR_V128); + +// ============================================================================ +// OPCODE_SHA +// ============================================================================ +struct SHA_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.SXTB(dest_src, dest_src); + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.SXTB(dest_src, dest_src); + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.SXTH(dest_src, dest_src); + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, WReg src) { + e.ASR(dest_src, dest_src, src.toX()); + }, + [](A64Emitter& e, XReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHA, SHA_I8, SHA_I16, SHA_I32, SHA_I64); + +// ============================================================================ +// OPCODE_ROTATE_LEFT +// ============================================================================ +// TODO(benvanik): put dest/src1 together, src2 in cl. 
+template +void EmitRotateLeftXX(A64Emitter& e, const ARGS& i) { + // ; rotate r1 left by r2, producing r0 + // ; (destroys r2) + // ; r1 = ABCDEFGH + // lslv r0, r1, r2 ; r0 = EFGH0000 + // mvn r2, r2 ; r2 = leftover bits + // lsrv r2, r1, r2 ; r2 = 0000ABCD + // orr r0, r0, r2 ; r0 = EFGHABCD + if (i.src1.is_constant) { + e.MOV(REG(0), i.src1.constant()); + } else { + e.MOV(REG(0), i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(REG(1), i.src2.constant()); + } else { + e.MOV(W1, i.src2.reg().toW()); + } + + e.LSLV(i.dest, REG(0), REG(1)); + e.MVN(REG(1), REG(1)); + e.LSRV(REG(1), REG(0), REG(1)); + e.ORR(i.dest, i.dest, REG(1)); +} +struct ROTATE_LEFT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +struct ROTATE_LEFT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +struct ROTATE_LEFT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOV(W0, i.src1.constant()); + } else { + e.MOV(W0, i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(W1, i.src2.constant()); + } else { + e.SXTB(W1, i.src2.reg()); + } + e.NEG(W1, W1); + + e.ROR(i.dest, W0, W1); + } +}; +struct ROTATE_LEFT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOV(X0, i.src1.constant()); + } else { + e.MOV(X0, i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(X1, i.src2.constant()); + } else { + e.SXTB(X1, i.src2.reg().toW()); + } + e.NEG(X1, X1); + + e.ROR(i.dest, X0, X1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT, ROTATE_LEFT_I8, ROTATE_LEFT_I16, + ROTATE_LEFT_I32, ROTATE_LEFT_I64); + +// ============================================================================ +// OPCODE_BYTE_SWAP +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +struct BYTE_SWAP_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp(e, i, [](A64Emitter& e, WReg dest_src) { + e.REV16(dest_src, dest_src); + }); + } +}; +struct BYTE_SWAP_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, [](A64Emitter& e, WReg dest_src) { e.REV(dest_src, dest_src); }); + } +}; +struct BYTE_SWAP_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, [](A64Emitter& e, XReg dest_src) { e.REV(dest_src, dest_src); }); + } +}; +struct BYTE_SWAP_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Reverse upper and lower 64-bit halfs + e.REV32(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP, BYTE_SWAP_I16, BYTE_SWAP_I32, + BYTE_SWAP_I64, BYTE_SWAP_V128); + +// ============================================================================ +// OPCODE_CNTLZ +// Count leading zeroes +// ============================================================================ +struct CNTLZ_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // No 8bit lzcnt, so do 32 and sub 24. + e.UXTB(i.dest, i.src1); + e.CLZ(i.dest, i.dest); + e.SUB(i.dest.reg(), i.dest.reg(), 24); + } +}; +struct CNTLZ_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // No 16bit lzcnt, so do 32 and sub 16. 
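+    // UXTH zero-extends the 16-bit value so CLZ counts over the full 32-bit
+    // register, adding exactly 16 extra leading zeros to subtract back out.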
+ e.UXTH(i.dest, i.src1); + e.CLZ(i.dest, i.dest); + e.SUB(i.dest.reg(), i.dest.reg(), 16); + } +}; +struct CNTLZ_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CLZ(i.dest, i.src1); + } +}; +struct CNTLZ_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CLZ(i.dest.reg().toX(), i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); + +// ============================================================================ +// OPCODE_SET_ROUNDING_MODE +// ============================================================================ +// Input: FPSCR (PPC format) +// Convert from PPC rounding mode to ARM +// PPC | ARM | +// 00 | 00 | nearest +// 01 | 11 | toward zero +// 10 | 01 | toward +infinity +// 11 | 10 | toward -infinity +static const uint8_t fpcr_table[] = { + 0b0'00, // |--|nearest + 0b0'11, // |--|toward zero + 0b0'01, // |--|toward +infinity + 0b0'10, // |--|toward -infinity + 0b1'00, // |FZ|nearest + 0b1'11, // |FZ|toward zero + 0b1'01, // |FZ|toward +infinity + 0b1'10, // |FZ|toward -infinity +}; +struct SET_ROUNDING_MODE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Low 3 bits are |Non-IEEE:1|RoundingMode:2| + // Non-IEEE bit is flush-to-zero + e.AND(W1, i.src1, 0b111); + + // Use the low 3 bits as an index into a LUT + e.MOV(X0, reinterpret_cast(fpcr_table)); + e.LDRB(W0, X0, X1); + + // Replace FPCR bits with new value + e.MRS(X1, SystemReg::FPCR); + e.BFI(X1, X0, 23, 3); + e.MSR(SystemReg::FPCR, X1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); + +// Include anchors to other sequence sources so they get included in the build. +extern volatile int anchor_control; +static int anchor_control_dest = anchor_control; + +extern volatile int anchor_memory; +static int anchor_memory_dest = anchor_memory; + +extern volatile int anchor_vector; +static int anchor_vector_dest = anchor_vector; + +bool SelectSequence(A64Emitter* e, const Instr* i, const Instr** new_tail) { + const InstrKey key(i); + auto it = sequence_table.find(key); + if (it != sequence_table.end()) { + if (it->second(*e, i)) { + *new_tail = i->next; + return true; + } + } + XELOGE("No sequence match for variant {}", i->opcode->name); + return false; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_sequences.h b/src/xenia/cpu/backend/a64/a64_sequences.h new file mode 100644 index 000000000..b47382633 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_sequences.h @@ -0,0 +1,51 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ +#define XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ + +#include "xenia/cpu/hir/instr.h" + +#include + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Emitter; + +typedef bool (*SequenceSelectFn)(A64Emitter&, const hir::Instr*); +extern std::unordered_map sequence_table; + +template +bool Register() { + sequence_table.insert({T::head_key(), T::Select}); + return true; +} + +template +static bool Register() { + bool b = true; + b = b && Register(); // Call the above function + b = b && Register(); // Call ourself again (recursively) + return b; +} +#define EMITTER_OPCODE_TABLE(name, ...) \ + const auto A64_INSTR_##name = Register<__VA_ARGS__>(); + +bool SelectSequence(A64Emitter* e, const hir::Instr* i, + const hir::Instr** new_tail); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ diff --git a/src/xenia/cpu/backend/a64/a64_stack_layout.h b/src/xenia/cpu/backend/a64/a64_stack_layout.h new file mode 100644 index 000000000..ee8cbcfac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_stack_layout.h @@ -0,0 +1,129 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ +#define XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ + +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class StackLayout { + public: + /** + * Stack Layout + * ---------------------------- + * NOTE: stack must always be 16b aligned. 
+ * + * Thunk stack: + * Non-Volatile Volatile + * +------------------+------------------+ + * | arg temp, 3 * 8 | arg temp, 3 * 8 | sp + 0x000 + * | | | + * | | | + * +------------------+------------------+ + * | rbx | (unused) | sp + 0x018 + * +------------------+------------------+ + * | rbp | X1 | sp + 0x020 + * +------------------+------------------+ + * | rcx (Win32) | X2 | sp + 0x028 + * +------------------+------------------+ + * | rsi (Win32) | X3 | sp + 0x030 + * +------------------+------------------+ + * | rdi (Win32) | X4 | sp + 0x038 + * +------------------+------------------+ + * | r12 | X5 | sp + 0x040 + * +------------------+------------------+ + * | r13 | X6 | sp + 0x048 + * +------------------+------------------+ + * | r14 | X7 | sp + 0x050 + * +------------------+------------------+ + * | r15 | X8 | sp + 0x058 + * +------------------+------------------+ + * | xmm6 (Win32) | X9 | sp + 0x060 + * | | | + * +------------------+------------------+ + * | xmm7 (Win32) | X10 | sp + 0x070 + * | | | + * +------------------+------------------+ + * | xmm8 (Win32) | X11 | sp + 0x080 + * | | | + * +------------------+------------------+ + * | xmm9 (Win32) | X12 | sp + 0x090 + * | | | + * +------------------+------------------+ + * | xmm10 (Win32) | X13 | sp + 0x0A0 + * | | | + * +------------------+------------------+ + * | xmm11 (Win32) | X14 | sp + 0x0B0 + * | | | + * +------------------+------------------+ + * | xmm12 (Win32) | X15 | sp + 0x0C0 + * | | | + * +------------------+------------------+ + * | xmm13 (Win32) | X16 | sp + 0x0D0 + * | | | + * +------------------+------------------+ + * | xmm14 (Win32) | X17 | sp + 0x0E0 + * | | | + * +------------------+------------------+ + * | xmm15 (Win32) | X18 | sp + 0x0F0 + * | | | + * +------------------+------------------+ + */ + XEPACKEDSTRUCT(Thunk, { + uint64_t arg_temp[3]; + uint64_t r[17]; + vec128_t xmm[22]; + }); + static_assert(sizeof(Thunk) % 16 == 0, + "sizeof(Thunk) must be a multiple of 16!"); + static const size_t THUNK_STACK_SIZE = sizeof(Thunk); + + /** + * + * + * Guest stack: + * +------------------+ + * | arg temp, 3 * 8 | sp + 0 + * | | + * | | + * +------------------+ + * | scratch, 48b | sp + 32(kStashOffset) + * | | + * +------------------+ + * | X0 / context | sp + 80 + * +------------------+ + * | guest ret addr | sp + 88 + * +------------------+ + * | call ret addr | sp + 96 + * +------------------+ + * ... locals ... + * +------------------+ + * | (return address) | + * +------------------+ + * + */ + static const size_t GUEST_STACK_SIZE = 96 + 16; + static const size_t GUEST_CTX_HOME = 80; + static const size_t GUEST_RET_ADDR = 88; + static const size_t GUEST_CALL_RET_ADDR = 96; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ diff --git a/src/xenia/cpu/backend/a64/a64_tracers.cc b/src/xenia/cpu/backend/a64/a64_tracers.cc new file mode 100644 index 000000000..146f50982 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_tracers.cc @@ -0,0 +1,225 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_tracers.h" + +#include + +#include "xenia/base/logging.h" +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +#define ITRACE 0 +#define DTRACE 0 + +#define TARGET_THREAD 0 + +bool trace_enabled = true; + +#define THREAD_MATCH \ + (!TARGET_THREAD || thread_state->thread_id() == TARGET_THREAD) +#define IFLUSH() +#define IPRINT(s) \ + if (trace_enabled && THREAD_MATCH) \ + xe::logging::AppendLogLine(xe::LogLevel::Debug, 't', s) +#define DFLUSH() +#define DPRINT(...) \ + if (trace_enabled && THREAD_MATCH) \ + xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 't', __VA_ARGS__) + +uint32_t GetTracingMode() { + uint32_t mode = 0; +#if ITRACE + mode |= TRACING_INSTR; +#endif // ITRACE +#if DTRACE + mode |= TRACING_DATA; +#endif // DTRACE + return mode; +} + +void TraceString(void* raw_context, const char* str) { + auto thread_state = *reinterpret_cast(raw_context); + IPRINT(str); + IFLUSH(); +} + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i8 +{}\n", (int8_t)value, value, offset); +} +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i16 +{}\n", (int16_t)value, value, offset); +} +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i32 +{}\n", (int32_t)value, value, offset); +} +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i64 +{}\n", (int64_t)value, value, offset); +} +void TraceContextLoadF32(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx f32 +{}\n", xe::m128_f32<0>(value), + xe::m128_i32<0>(value), offset); +} +void TraceContextLoadF64(void* raw_context, uint64_t offset, + const double* value) { + auto thread_state = *reinterpret_cast(raw_context); + // auto v = _mm_loadu_pd(value); + auto v = vld1q_f64(value); + DPRINT("{} ({:X}) = ctx f64 +{}\n", xe::m128_f64<0>(v), xe::m128_i64<0>(v), + offset); +} +void TraceContextLoadV128(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = ctx v128 +{}\n", + xe::m128_f32<0>(value), xe::m128_f32<1>(value), xe::m128_f32<2>(value), + xe::m128_f32<3>(value), xe::m128_i32<0>(value), xe::m128_i32<1>(value), + xe::m128_i32<2>(value), xe::m128_i32<3>(value), offset); +} + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i8 +{} = {} ({:X})\n", offset, (int8_t)value, value); +} +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i16 +{} = {} ({:X})\n", offset, (int16_t)value, value); +} +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + 
DPRINT("ctx i32 +{} = {} ({:X})\n", offset, (int32_t)value, value); +} +void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i64 +{} = {} ({:X})\n", offset, (int64_t)value, value); +} +void TraceContextStoreF32(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx f32 +{} = {} ({:X})\n", offset, xe::m128_f32<0>(value), + xe::m128_i32<0>(value)); +} +void TraceContextStoreF64(void* raw_context, uint64_t offset, + const double* value) { + auto thread_state = *reinterpret_cast(raw_context); + // auto v = _mm_loadu_pd(value); + auto v = vld1q_f64(value); + DPRINT("ctx f64 +{} = {} ({:X})\n", offset, xe::m128_f64<0>(v), + xe::m128_i64<0>(v)); +} +void TraceContextStoreV128(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx v128 +{} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n", + offset, xe::m128_f32<0>(value), xe::m128_f32<1>(value), + xe::m128_f32<2>(value), xe::m128_f32<3>(value), xe::m128_i32<0>(value), + xe::m128_i32<1>(value), xe::m128_i32<2>(value), + xe::m128_i32<3>(value)); +} + +void TraceMemoryLoadI8(void* raw_context, uint32_t address, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i8 {:08X}\n", (int8_t)value, value, address); +} +void TraceMemoryLoadI16(void* raw_context, uint32_t address, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i16 {:08X}\n", (int16_t)value, value, address); +} +void TraceMemoryLoadI32(void* raw_context, uint32_t address, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i32 {:08X}\n", (int32_t)value, value, address); +} +void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i64 {:08X}\n", (int64_t)value, value, address); +} +void TraceMemoryLoadF32(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.f32 {:08X}\n", xe::m128_f32<0>(value), + xe::m128_i32<0>(value), address); +} +void TraceMemoryLoadF64(void* raw_context, uint32_t address, + float64x2_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.f64 {:08X}\n", xe::m128_f64<0>(value), + xe::m128_i64<0>(value), address); +} +void TraceMemoryLoadV128(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT( + "[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = load.v128 {:08X}\n", + xe::m128_f32<0>(value), xe::m128_f32<1>(value), xe::m128_f32<2>(value), + xe::m128_f32<3>(value), xe::m128_i32<0>(value), xe::m128_i32<1>(value), + xe::m128_i32<2>(value), xe::m128_i32<3>(value), address); +} + +void TraceMemoryStoreI8(void* raw_context, uint32_t address, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i8 {:08X} = {} ({:X})\n", address, (int8_t)value, value); +} +void TraceMemoryStoreI16(void* raw_context, uint32_t address, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i16 {:08X} = {} ({:X})\n", address, (int16_t)value, value); +} +void TraceMemoryStoreI32(void* raw_context, uint32_t address, uint32_t value) { + auto thread_state = 
*reinterpret_cast(raw_context); + DPRINT("store.i32 {:08X} = {} ({:X})\n", address, (int32_t)value, value); +} +void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i64 {:08X} = {} ({:X})\n", address, (int64_t)value, value); +} +void TraceMemoryStoreF32(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.f32 {:08X} = {} ({:X})\n", address, xe::m128_f32<0>(value), + xe::m128_i32<0>(value)); +} +void TraceMemoryStoreF64(void* raw_context, uint32_t address, + float64x2_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.f64 {:08X} = {} ({:X})\n", address, xe::m128_f64<0>(value), + xe::m128_i64<0>(value)); +} +void TraceMemoryStoreV128(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT( + "store.v128 {:08X} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n", + address, xe::m128_f32<0>(value), xe::m128_f32<1>(value), + xe::m128_f32<2>(value), xe::m128_f32<3>(value), xe::m128_i32<0>(value), + xe::m128_i32<1>(value), xe::m128_i32<2>(value), xe::m128_i32<3>(value)); +} + +void TraceMemset(void* raw_context, uint32_t address, uint8_t value, + uint32_t length) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("memset {:08X}-{:08X} ({}) = {:02X}", address, address + length, + length, value); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_tracers.h b/src/xenia/cpu/backend/a64/a64_tracers.h new file mode 100644 index 000000000..62b740356 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_tracers.h @@ -0,0 +1,82 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ +#define XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ + +#include +#include + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { +class A64Emitter; + +enum TracingMode { + TRACING_INSTR = (1 << 1), + TRACING_DATA = (1 << 2), +}; + +uint32_t GetTracingMode(); +inline bool IsTracingInstr() { return (GetTracingMode() & TRACING_INSTR) != 0; } +inline bool IsTracingData() { return (GetTracingMode() & TRACING_DATA) != 0; } + +void TraceString(void* raw_context, const char* str); + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextLoadF32(void* raw_context, uint64_t offset, float32x4_t value); +void TraceContextLoadF64(void* raw_context, uint64_t offset, + const double* value); +void TraceContextLoadV128(void* raw_context, uint64_t offset, + float32x4_t value); + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextStoreF32(void* raw_context, uint64_t offset, + float32x4_t value); +void TraceContextStoreF64(void* raw_context, uint64_t offset, + const double* value); +void TraceContextStoreV128(void* raw_context, uint64_t offset, + float32x4_t value); + +void TraceMemoryLoadI8(void* raw_context, uint32_t address, uint8_t value); +void TraceMemoryLoadI16(void* raw_context, uint32_t address, uint16_t value); +void TraceMemoryLoadI32(void* raw_context, uint32_t address, uint32_t value); +void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value); +void TraceMemoryLoadF32(void* raw_context, uint32_t address, float32x4_t value); +void TraceMemoryLoadF64(void* raw_context, uint32_t address, float64x2_t value); +void TraceMemoryLoadV128(void* raw_context, uint32_t address, + float32x4_t value); + +void TraceMemoryStoreI8(void* raw_context, uint32_t address, uint8_t value); +void TraceMemoryStoreI16(void* raw_context, uint32_t address, uint16_t value); +void TraceMemoryStoreI32(void* raw_context, uint32_t address, uint32_t value); +void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value); +void TraceMemoryStoreF32(void* raw_context, uint32_t address, + float32x4_t value); +void TraceMemoryStoreF64(void* raw_context, uint32_t address, + float64x2_t value); +void TraceMemoryStoreV128(void* raw_context, uint32_t address, + float32x4_t value); + +void TraceMemset(void* raw_context, uint32_t address, uint8_t value, + uint32_t length); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ diff --git a/src/xenia/cpu/backend/a64/a64_util.h b/src/xenia/cpu/backend/a64/a64_util.h new file mode 100644 index 000000000..0b950b8ae --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_util.h @@ -0,0 +1,95 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + 
****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_UTIL_H_ +#define XENIA_CPU_BACKEND_A64_A64_UTIL_H_ + +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// Attempts to convert an fp32 bit-value into an fp8-immediate value for FMOV +// returns false if the value cannot be represented +// C2.2.3 Modified immediate constants in A64 ing-point instructions +// abcdefgh +// V +// aBbbbbbc defgh000 00000000 00000000 +// B = NOT(b) +constexpr bool f32_to_fimm8(uint32_t u32, oaknut::FImm8& fp8) { + const uint32_t sign = (u32 >> 31) & 1; + int32_t exp = ((u32 >> 23) & 0xff) - 127; + int64_t mantissa = u32 & 0x7fffff; + + // Too many mantissa bits + if (mantissa & 0x7ffff) { + return false; + } + // Too many exp bits + if (exp < -3 || exp > 4) { + return false; + } + + // mantissa = (16 + e:f:g:h) / 16. + mantissa >>= 19; + if ((mantissa & 0b1111) != mantissa) { + return false; + } + + // exp = (NOT(b):c:d) - 3 + exp = ((exp + 3) & 0b111) ^ 0b100; + + fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa)); + return true; +} + +// Attempts to convert an fp64 bit-value into an fp8-immediate value for FMOV +// returns false if the value cannot be represented +// C2.2.3 Modified immediate constants in A64 floating-point instructions +// abcdefgh +// V +// aBbbbbbb bbcdefgh 00000000 00000000 00000000 00000000 00000000 00000000 +// B = NOT(b) +constexpr bool f64_to_fimm8(uint64_t u64, oaknut::FImm8& fp8) { + const uint32_t sign = (u64 >> 63) & 1; + int32_t exp = ((u64 >> 52) & 0x7ff) - 1023; + int64_t mantissa = u64 & 0xfffffffffffffULL; + + // Too many mantissa bits + if (mantissa & 0xffffffffffffULL) { + return false; + } + // Too many exp bits + if (exp < -3 || exp > 4) { + return false; + } + + // mantissa = (16 + e:f:g:h) / 16. + mantissa >>= 48; + if ((mantissa & 0b1111) != mantissa) { + return false; + } + + // exp = (NOT(b):c:d) - 3 + exp = ((exp + 3) & 0b111) ^ 0b100; + + fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa)); + return true; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_UTIL_H_ diff --git a/src/xenia/cpu/backend/a64/premake5.lua b/src/xenia/cpu/backend/a64/premake5.lua new file mode 100644 index 000000000..32b2d51a0 --- /dev/null +++ b/src/xenia/cpu/backend/a64/premake5.lua @@ -0,0 +1,31 @@ +project_root = "../../../../.." 
+include(project_root.."/tools/build") + +group("src") +project("xenia-cpu-backend-a64") + uuid("495f3f3e-f5e8-489a-bd0f-289d0495bc08") + filter("architecture:ARM64") + kind("StaticLib") + filter("architecture:not ARM64") + kind("None") + filter({}) + language("C++") + cppdialect("C++20") + links({ + "fmt", + "xenia-base", + "xenia-cpu", + }) + defines({ + }) + + disablewarnings({ + -- Silence errors in oaknut + "4146", -- unary minus operator applied to unsigned type, result still unsigned + "4267" -- 'initializing': conversion from 'size_t' to 'uint32_t', possible loss of data + }) + + includedirs({ + project_root.."/third_party/oaknut/include", + }) + local_platform_files() diff --git a/src/xenia/cpu/backend/x64/premake5.lua b/src/xenia/cpu/backend/x64/premake5.lua index f2a990f29..90e5288cf 100644 --- a/src/xenia/cpu/backend/x64/premake5.lua +++ b/src/xenia/cpu/backend/x64/premake5.lua @@ -4,7 +4,11 @@ include(project_root.."/tools/build") group("src") project("xenia-cpu-backend-x64") uuid("7d8d5dce-4696-4197-952a-09506f725afe") - kind("StaticLib") + filter("architecture:x86_64") + kind("StaticLib") + filter("architecture:not x86_64") + kind("None") + filter({}) language("C++") links({ "capstone", diff --git a/src/xenia/cpu/breakpoint.cc b/src/xenia/cpu/breakpoint.cc index 9572d4760..ebcd84efb 100644 --- a/src/xenia/cpu/breakpoint.cc +++ b/src/xenia/cpu/breakpoint.cc @@ -48,7 +48,8 @@ std::string Breakpoint::to_string() const { str += " " + functions[0]->name(); return str; } else { - return std::string("x64 ") + xe::string_util::to_hex_string(host_address()); + return std::string(XE_HOST_ARCH_NAME " ") + + xe::string_util::to_hex_string(host_address()); } } diff --git a/src/xenia/cpu/cpu_flags.cc b/src/xenia/cpu/cpu_flags.cc index 614dabae8..de7fb78e8 100644 --- a/src/xenia/cpu/cpu_flags.cc +++ b/src/xenia/cpu/cpu_flags.cc @@ -9,7 +9,7 @@ #include "xenia/cpu/cpu_flags.h" -DEFINE_string(cpu, "any", "CPU backend [any, x64].", "CPU"); +DEFINE_string(cpu, "any", "CPU backend [any, x64, a64].", "CPU"); DEFINE_string( load_module_map, "", diff --git a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc index 5faa4998e..639b14ba3 100644 --- a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc +++ b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc @@ -23,6 +23,8 @@ #if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" #endif // XE_ARCH #if XE_COMPILER_MSVC @@ -203,11 +205,17 @@ class TestRunner { if (cvars::cpu == "x64") { backend.reset(new xe::cpu::backend::x64::X64Backend()); } +#elif XE_ARCH_ARM64 + if (cvars::cpu == "a64") { + backend.reset(new xe::cpu::backend::a64::A64Backend()); + } #endif // XE_ARCH if (cvars::cpu == "any") { if (!backend) { #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH } } diff --git a/src/xenia/cpu/ppc/testing/premake5.lua b/src/xenia/cpu/ppc/testing/premake5.lua index bca2bb81e..96afb593e 100644 --- a/src/xenia/cpu/ppc/testing/premake5.lua +++ b/src/xenia/cpu/ppc/testing/premake5.lua @@ -27,7 +27,11 @@ project("xenia-cpu-ppc-tests") links({ "xenia-cpu-backend-x64", }) - filter("platforms:Windows") + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Windows-*") debugdir(project_root) debugargs({ "2>&1", diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 
eb63a1abf..cf8c028f0 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -34,7 +34,11 @@ #include "xenia/cpu/xex_module.h" // TODO(benvanik): based on compiler support +#ifdef XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" +#endif // XE_ARCH #if 0 && DEBUG #define DEFAULT_DEBUG_FLAG true diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index 0aa06a26d..5e13ab818 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -162,7 +162,7 @@ class Processor { // This will cancel any active step operations and resume all threads. void Continue(); - // Steps the given thread a single x64 host instruction. + // Steps the given thread a single host instruction. // If the step is over a branch the branch will be followed. void StepHostInstruction(uint32_t thread_id); diff --git a/src/xenia/cpu/stack_walker_win.cc b/src/xenia/cpu/stack_walker_win.cc index aaaab140a..7444e725a 100644 --- a/src/xenia/cpu/stack_walker_win.cc +++ b/src/xenia/cpu/stack_walker_win.cc @@ -58,6 +58,12 @@ LPSYMFUNCTIONTABLEACCESS64 sym_function_table_access_64_ = nullptr; LPSYMGETMODULEBASE64 sym_get_module_base_64_ = nullptr; LPSYMGETSYMFROMADDR64 sym_get_sym_from_addr_64_ = nullptr; +#if XE_ARCH_AMD64 +static const DWORD kMachineType = IMAGE_FILE_MACHINE_AMD64; +#elif XE_ARCH_ARM64 +static const DWORD kMachineType = IMAGE_FILE_MACHINE_ARM64; +#endif + namespace xe { namespace cpu { @@ -173,40 +179,70 @@ class Win32StackWalker : public StackWalker { } else { // Copy thread context local. We will be modifying it during stack // walking, so we don't want to mess with the incoming copy. +#if XE_ARCH_AMD64 thread_context.Rip = in_host_context->rip; thread_context.EFlags = in_host_context->eflags; std::memcpy(&thread_context.Rax, in_host_context->int_registers, sizeof(in_host_context->int_registers)); std::memcpy(&thread_context.Xmm0, in_host_context->xmm_registers, sizeof(in_host_context->xmm_registers)); +#elif XE_ARCH_ARM64 + thread_context.Pc = in_host_context->pc; + thread_context.Cpsr = in_host_context->cpsr; + std::memcpy(thread_context.X, in_host_context->x, + sizeof(in_host_context->x)); + std::memcpy(&thread_context.V, in_host_context->v, + sizeof(in_host_context->v)); +#endif } if (out_host_context) { // Write out the captured thread context if the caller asked for it. +#if XE_ARCH_AMD64 out_host_context->rip = thread_context.Rip; out_host_context->eflags = thread_context.EFlags; std::memcpy(out_host_context->int_registers, &thread_context.Rax, sizeof(out_host_context->int_registers)); std::memcpy(out_host_context->xmm_registers, &thread_context.Xmm0, sizeof(out_host_context->xmm_registers)); +#elif XE_ARCH_ARM64 + out_host_context->pc = thread_context.Pc; + out_host_context->cpsr = thread_context.Cpsr; + std::memcpy(out_host_context->x, &thread_context.X, + sizeof(out_host_context->x)); + std::memcpy(out_host_context->v, &thread_context.V, + sizeof(out_host_context->v)); +#endif } // Setup the frame for walking. 
STACKFRAME64 stack_frame = {0}; stack_frame.AddrPC.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrPC.Offset = thread_context.Rip; +#elif XE_ARCH_ARM64 + stack_frame.AddrPC.Offset = thread_context.Pc; +#endif stack_frame.AddrFrame.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrFrame.Offset = thread_context.Rbp; +#elif XE_ARCH_ARM64 + stack_frame.AddrFrame.Offset = thread_context.Fp; +#endif stack_frame.AddrStack.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrStack.Offset = thread_context.Rsp; +#elif XE_ARCH_ARM64 + stack_frame.AddrStack.Offset = thread_context.Sp; +#endif // Walk the stack. // Note that StackWalk64 is thread safe, though other dbghelp functions are // not. size_t frame_index = 0; while (frame_index < frame_count && - stack_walk_64_(IMAGE_FILE_MACHINE_AMD64, GetCurrentProcess(), - thread_handle, &stack_frame, &thread_context, nullptr, + stack_walk_64_(kMachineType, GetCurrentProcess(), thread_handle, + &stack_frame, &thread_context, nullptr, XSymFunctionTableAccess64, XSymGetModuleBase64, nullptr) == TRUE) { if (frame_index >= frame_offset) { @@ -237,7 +273,7 @@ class Win32StackWalker : public StackWalker { if (function) { frame.guest_symbol.function = function; // Figure out where in guest code we are by looking up the - // displacement in x64 from the JIT'ed code start to the PC. + // displacement in bytes from the JIT'ed code start to the PC. if (function->is_guest()) { auto guest_function = static_cast(function); // Adjust the host PC by -1 so that we will go back into whatever diff --git a/src/xenia/cpu/testing/premake5.lua b/src/xenia/cpu/testing/premake5.lua index 5e70fb3f8..afc1540e7 100644 --- a/src/xenia/cpu/testing/premake5.lua +++ b/src/xenia/cpu/testing/premake5.lua @@ -19,6 +19,12 @@ test_suite("xenia-cpu-tests", project_root, ".", { links = { "xenia-cpu-backend-x64", }, - } + }, + { + filter = 'architecture:ARM64', + links = { + "xenia-cpu-backend-a64", + }, + }, }, }) diff --git a/src/xenia/cpu/testing/util.h b/src/xenia/cpu/testing/util.h index 8f6df2d57..f77c5a11a 100644 --- a/src/xenia/cpu/testing/util.h +++ b/src/xenia/cpu/testing/util.h @@ -13,7 +13,12 @@ #include #include "xenia/base/platform.h" +#if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" +#endif // XE_ARCH + #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/ppc/ppc_frontend.h" @@ -39,6 +44,8 @@ class TestFunction { std::unique_ptr backend; #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH if (backend) { auto processor = std::make_unique(memory.get(), nullptr); @@ -74,7 +81,7 @@ class TestFunction { uint32_t stack_address = memory_size - stack_size; uint32_t thread_state_address = stack_address - 0x1000; auto thread_state = std::make_unique(processor.get(), 0x100); - assert_always(); // TODO: Allocate a thread stack!!! + // assert_always(); // TODO: Allocate a thread stack!!! 
auto ctx = thread_state->context(); ctx->lr = 0xBCBCBCBC; diff --git a/src/xenia/debug/ui/debug_window.cc b/src/xenia/debug/ui/debug_window.cc index 07d4404db..06dba452a 100644 --- a/src/xenia/debug/ui/debug_window.cc +++ b/src/xenia/debug/ui/debug_window.cc @@ -63,7 +63,13 @@ DebugWindow::DebugWindow(Emulator* emulator, processor_(emulator->processor()), app_context_(app_context), window_(xe::ui::Window::Create(app_context_, kBaseTitle, 1500, 1000)) { - if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) { + if ( +#ifdef XE_ARCH_AMD64 + cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) +#elif XE_ARCH_ARM64 + cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) +#endif + != CS_ERR_OK) { assert_always("Failed to initialize capstone"); } cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); @@ -338,7 +344,7 @@ void DebugWindow::DrawSourcePane() { // copy button // address start - end // name text box (editable) - // combo for interleaved + [ppc, hir, opt hir, x64 + byte with sizes] + // combo for interleaved + [ppc, hir, opt hir, asm + byte with sizes] ImGui::AlignTextToFramePadding(); ImGui::Text("%s", function->module()->name().c_str()); ImGui::SameLine(); @@ -383,11 +389,11 @@ void DebugWindow::DrawSourcePane() { } ImGui::SameLine(); if (state_.source_display_mode > 0) { - // Only show x64 step button if we have x64 visible. + // Only show asm step button if we have asm visible. ImGui::Dummy(ImVec2(4, 0)); ImGui::SameLine(); ImGui::PushButtonRepeat(true); - if (ImGui::ButtonEx("Step x64", ImVec2(0, 0), + if (ImGui::ButtonEx("Step " XE_HOST_ARCH_NAME, ImVec2(0, 0), can_step ? 0 : ImGuiItemFlags_Disabled)) { // By enabling the button when stepping we allow repeat behavior. if (processor_->execution_state() != cpu::ExecutionState::kStepping) { @@ -396,8 +402,8 @@ void DebugWindow::DrawSourcePane() { } ImGui::PopButtonRepeat(); if (ImGui::IsItemHovered()) { - ImGui::SetTooltip( - "Step one x64 instruction on the current thread (hold for many)."); + ImGui::SetTooltip("Step one " XE_HOST_ARCH_NAME + " instruction on the current thread (hold for many)."); } ImGui::SameLine(); } @@ -412,9 +418,9 @@ void DebugWindow::DrawSourcePane() { if (function->is_guest()) { const char* kSourceDisplayModes[] = { "PPC", - "PPC+HIR+x64", - "PPC+HIR (opt)+x64", - "PPC+x64", + "PPC+HIR+" XE_HOST_ARCH_NAME, + "PPC+HIR (opt)+" XE_HOST_ARCH_NAME, + "PPC+" XE_HOST_ARCH_NAME, }; ImGui::PushItemWidth(90); ImGui::Combo("##display_mode", &state_.source_display_mode, @@ -459,7 +465,7 @@ void DebugWindow::DrawGuestFunctionSource() { // labels get their own line with duped addresses // show xrefs to labels? // hir greyed and offset (background color change?) - // x64 greyed and offset with native address + // asm greyed and offset with native address // hover on registers/etc for tooltip/highlight others // click register to go to location of last write // click code address to jump to code @@ -472,18 +478,18 @@ void DebugWindow::DrawGuestFunctionSource() { bool draw_hir = false; bool draw_hir_opt = false; - bool draw_x64 = false; + bool draw_asm = false; switch (state_.source_display_mode) { case 1: draw_hir = true; - draw_x64 = true; + draw_asm = true; break; case 2: draw_hir_opt = true; - draw_x64 = true; + draw_asm = true; break; case 3: - draw_x64 = true; + draw_asm = true; break; } @@ -498,8 +504,8 @@ void DebugWindow::DrawGuestFunctionSource() { if (draw_hir_opt) { // TODO(benvanik): get HIR and draw preamble. } - if (draw_x64) { - // x64 preamble. 
+ if (draw_asm) { + // asm preamble. DrawMachineCodeSource(function->machine_code(), source_map[0].code_offset); } @@ -512,7 +518,7 @@ bool is_current_instr = address == guest_pc; if (is_current_instr) { ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.0f, 1.0f, 0.0f, 1.0f)); - if (!draw_x64) { + if (!draw_asm) { ScrollToSourceIfPcChanged(); } } @@ -548,7 +554,7 @@ if (draw_hir_opt) { // TODO(benvanik): get HIR and draw for this PPC function. } - if (draw_x64) { + if (draw_asm) { const uint8_t* machine_code_start = function->machine_code() + source_map[source_map_index].code_offset; const size_t machine_code_length = @@ -851,10 +857,10 @@ void DebugWindow::DrawRegistersPane() { if (state_.register_group == RegisterGroup::kHostGeneral) { ImGui::PushStyleColor(ImGuiCol_Button, ImGui::GetStyle().Colors[ImGuiCol_ButtonActive]); - ImGui::Button("x64"); + ImGui::Button(XE_HOST_ARCH_NAME); ImGui::PopStyleColor(); } else { - if (ImGui::Button("x64")) { + if (ImGui::Button(XE_HOST_ARCH_NAME)) { state_.register_group = RegisterGroup::kHostGeneral; } } @@ -862,10 +868,10 @@ if (state_.register_group == RegisterGroup::kHostVector) { ImGui::PushStyleColor(ImGuiCol_Button, ImGui::GetStyle().Colors[ImGuiCol_ButtonActive]); - ImGui::Button("XMM"); + ImGui::Button(XE_HOST_ARCH_NAME "-vec"); ImGui::PopStyleColor(); } else { - if (ImGui::Button("XMM")) { + if (ImGui::Button(XE_HOST_ARCH_NAME "-vec")) { state_.register_group = RegisterGroup::kHostVector; } } @@ -958,6 +964,7 @@ } break; case RegisterGroup::kHostGeneral: { ImGui::BeginChild("##host_general"); +#if XE_ARCH_AMD64 for (int i = 0; i < 18; ++i) { auto reg = static_cast<X64Register>(i); ImGui::BeginGroup(); @@ -995,6 +1002,46 @@ i, thread_info->host_context.xmm_registers[i].f32); ImGui::EndGroup(); } +#elif XE_ARCH_ARM64 + // TODO(wunkolo): print ARM64 registers + for (int i = 0; i < 34; ++i) { + auto reg = static_cast<Arm64Register>(i); + ImGui::BeginGroup(); + ImGui::AlignTextToFramePadding(); + ImGui::Text("%3s", HostThreadContext::GetRegisterName(reg)); + ImGui::SameLine(); + ImGui::Dummy(ImVec2(4, 0)); + ImGui::SameLine(); + if (reg == Arm64Register::kPc) { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.pc); + } else if (reg == Arm64Register::kPstate) { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.cpsr); + } else { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.x[i]); + } + ImGui::EndGroup(); + } + ImGui::EndChild(); + } break; + case RegisterGroup::kHostVector: { + ImGui::BeginChild("##host_vector"); + for (int i = 0; i < 32; ++i) { + auto reg = static_cast<Arm64Register>( + static_cast<int>(Arm64Register::kV0) + i); + ImGui::BeginGroup(); + ImGui::AlignTextToFramePadding(); + ImGui::Text("%5s", HostThreadContext::GetRegisterName(reg)); + ImGui::SameLine(); + ImGui::Dummy(ImVec2(4, 0)); + ImGui::SameLine(); + dirty_host_context |= + DrawRegisterTextBoxes(i, thread_info->host_context.v[i].f32); + ImGui::EndGroup(); + } +#endif ImGui::EndChild(); } } @@ -1144,7 +1191,8 @@ void DebugWindow::DrawBreakpointsPane() { ImGui::OpenPopup("##add_code_breakpoint"); } if (ImGui::IsItemHovered()) { - ImGui::SetTooltip("Add a code breakpoint for either PPC or x64."); + ImGui::SetTooltip( + "Add a code breakpoint for either PPC or " XE_HOST_ARCH_NAME "."); } // TODO(benvanik): remove this set focus workaround
when imgui is fixed: // https://github.com/ocornut/imgui/issues/343 @@ -1178,15 +1226,15 @@ ImGui::Dummy(ImVec2(0, 2)); ImGui::AlignTextToFramePadding(); - ImGui::Text("x64"); + ImGui::Text(XE_HOST_ARCH_NAME); ImGui::SameLine(); ImGui::Dummy(ImVec2(2, 0)); ImGui::SameLine(); - static char x64_buffer[64] = {0}; + static char asm_buffer[64] = {0}; ImGui::PushItemWidth(100); - if (ImGui::InputText("##host_address", x64_buffer, 17, input_flags)) { - uint64_t address = string_util::from_string<uint64_t>(x64_buffer, true); - x64_buffer[0] = 0; + if (ImGui::InputText("##host_address", asm_buffer, 17, input_flags)) { + uint64_t address = string_util::from_string<uint64_t>(asm_buffer, true); + asm_buffer[0] = 0; CreateCodeBreakpoint(Breakpoint::AddressType::kHost, address); ImGui::CloseCurrentPopup(); } diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc index cca28982f..836ba3420 100644 --- a/src/xenia/emulator.cc +++ b/src/xenia/emulator.cc @@ -53,6 +53,8 @@ #if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" #endif // XE_ARCH DECLARE_int32(user_language); @@ -172,11 +174,18 @@ X_STATUS Emulator::Setup( if (cvars::cpu == "x64") { backend.reset(new xe::cpu::backend::x64::X64Backend()); } +#elif XE_ARCH_ARM64 + if (cvars::cpu == "a64") { + backend.reset(new xe::cpu::backend::a64::A64Backend()); + } #endif // XE_ARCH if (cvars::cpu == "any") { if (!backend) { #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + // TODO(wunkolo): Arm64 backend + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH } } diff --git a/src/xenia/gpu/d3d12/premake5.lua b/src/xenia/gpu/d3d12/premake5.lua index f0ee8cc02..92633f74c 100644 --- a/src/xenia/gpu/d3d12/premake5.lua +++ b/src/xenia/gpu/d3d12/premake5.lua @@ -70,6 +70,11 @@ project("xenia-gpu-d3d12-trace-viewer") "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + group("src") project("xenia-gpu-d3d12-trace-dump") uuid("686b859c-0046-44c4-a02c-41fc3fb75698") @@ -120,3 +125,8 @@ project("xenia-gpu-d3d12-trace-dump") links({ "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) diff --git a/src/xenia/gpu/premake5.lua b/src/xenia/gpu/premake5.lua index 971d6ef70..850580ca2 100644 --- a/src/xenia/gpu/premake5.lua +++ b/src/xenia/gpu/premake5.lua @@ -43,7 +43,7 @@ project("xenia-gpu-shader-compiler") "../base/console_app_main_"..platform_suffix..".cc", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. local user_file = project_root.."/build/xenia-gpu-shader-compiler.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 90ae7c46e..41f862aeb 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -68,6 +68,11 @@ project("xenia-gpu-vulkan-trace-viewer") "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Linux") links({ "X11", @@ -75,7 +80,7 @@ project("xenia-gpu-vulkan-trace-viewer") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist.
local user_file = project_root.."/build/xenia-gpu-vulkan-trace-viewer.vcxproj.user" if not os.isfile(user_file) then @@ -131,6 +136,11 @@ project("xenia-gpu-vulkan-trace-dump") "xenia-cpu-backend-x64", }) + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Linux") links({ "X11", @@ -138,7 +148,7 @@ project("xenia-gpu-vulkan-trace-dump") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. local user_file = project_root.."/build/xenia-gpu-vulkan-trace-dump.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/hid/premake5.lua b/src/xenia/hid/premake5.lua index 4e961f623..844a313f4 100644 --- a/src/xenia/hid/premake5.lua +++ b/src/xenia/hid/premake5.lua @@ -53,7 +53,7 @@ project("xenia-hid-demo") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "xenia-hid-winkey", "xenia-hid-xinput", diff --git a/src/xenia/ui/premake5.lua b/src/xenia/ui/premake5.lua index 6aff82bec..8f50fd515 100644 --- a/src/xenia/ui/premake5.lua +++ b/src/xenia/ui/premake5.lua @@ -19,7 +19,7 @@ project("xenia-ui") -- Exports JNI functions. wholelib("On") - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "dwmapi", "dxgi", diff --git a/third_party/SDL2.lua b/third_party/SDL2.lua index 972aa1aa7..2186de6b7 100644 --- a/third_party/SDL2.lua +++ b/third_party/SDL2.lua @@ -26,7 +26,7 @@ end -- Call this function in project scope to include the SDL2 headers. -- function sdl2_include() - filter("platforms:Windows") + filter("platforms:Windows-*") includedirs({ path.getrelative(".", third_party_path) .. "/SDL2/include", }) diff --git a/third_party/capstone.lua b/third_party/capstone.lua index 6dc415974..8dfb328f7 100644 --- a/third_party/capstone.lua +++ b/third_party/capstone.lua @@ -4,13 +4,37 @@ project("capstone") kind("StaticLib") language("C") defines({ - "CAPSTONE_X86_ATT_DISABLE", "CAPSTONE_DIET_NO", - "CAPSTONE_X86_REDUCE_NO", - "CAPSTONE_HAS_X86", "CAPSTONE_USE_SYS_DYN_MEM", "_LIB", }) + filter("architecture:x86_64") + defines({ + "CAPSTONE_HAS_X86", + "CAPSTONE_X86_ATT_DISABLE", + "CAPSTONE_X86_REDUCE_NO", + }) + files({ + "capstone/arch/X86/*.c", + "capstone/arch/X86/*.h", + "capstone/arch/X86/*.inc", + }) + force_compile_as_c({ + "capstone/arch/X86/**.c", + }) + filter("architecture:ARM64") + defines({ + "CAPSTONE_HAS_ARM64", + }) + files({ + "capstone/arch/AArch64/*.c", + "capstone/arch/AArch64/*.h", + "capstone/arch/AArch64/*.inc", + }) + force_compile_as_c({ + "capstone/arch/AArch64/**.c", + }) + filter({}) includedirs({ "capstone", "capstone/include", @@ -32,12 +56,7 @@ project("capstone") "capstone/SStream.h", "capstone/utils.c", "capstone/utils.h", - - "capstone/arch/X86/*.c", - "capstone/arch/X86/*.h", - "capstone/arch/X86/*.inc", }) force_compile_as_c({ - "capstone/**.c", - "capstone/arch/X86/**.c", - }) + "capstone/**.c", + }) \ No newline at end of file diff --git a/third_party/discord-rpc.lua b/third_party/discord-rpc.lua index 1f6e795f8..ca7d0370e 100644 --- a/third_party/discord-rpc.lua +++ b/third_party/discord-rpc.lua @@ -30,7 +30,7 @@ project("discord-rpc") files({ "discord-rpc/src/discord_register_osx.m" }) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ "discord-rpc/src/connection_win.cpp", "discord-rpc/src/discord_register_win.cpp" diff --git a/third_party/microprofile/microprofileui.h b/third_party/microprofile/microprofileui.h index d422445dd..8f47a619d 100644 --- 
a/third_party/microprofile/microprofileui.h +++ b/third_party/microprofile/microprofileui.h @@ -3252,7 +3252,7 @@ void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight) #if MICROPROFILE_CONTEXT_SWITCH_TRACE MicroProfileStringArrayAddLiteral(&Debug, "Context Switch"); - MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage ); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, S.nContextSwitchUsage ? MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage : 0 ); #endif for(int i = 0; i < MICROPROFILE_MAX_THREADS; ++i) diff --git a/third_party/mspack.lua b/third_party/mspack.lua index c1d1b44a5..94d6a6c81 100644 --- a/third_party/mspack.lua +++ b/third_party/mspack.lua @@ -28,7 +28,7 @@ project("mspack") "mspack/system.h", }) - filter("platforms:Windows") + filter("platforms:Windows-*") defines({ }) filter("platforms:Linux") diff --git a/third_party/oaknut b/third_party/oaknut new file mode 160000 index 000000000..94c726ce0 --- /dev/null +++ b/third_party/oaknut @@ -0,0 +1 @@ +Subproject commit 94c726ce0338b054eb8cb5ea91de8fe6c19f4392 diff --git a/third_party/snappy.lua b/third_party/snappy.lua index bf13b762e..3e6b1009d 100644 --- a/third_party/snappy.lua +++ b/third_party/snappy.lua @@ -18,5 +18,5 @@ project("snappy") "snappy/snappy.h", }) - filter("platforms:Windows") + filter("platforms:Windows-*") warnings("Off") -- Too many warnings. diff --git a/tools/build/scripts/platform_files.lua b/tools/build/scripts/platform_files.lua index ec1579cf0..332436dad 100644 --- a/tools/build/scripts/platform_files.lua +++ b/tools/build/scripts/platform_files.lua @@ -20,7 +20,7 @@ local function match_platform_files(base_path, base_match) removefiles({base_path.."/".."**_android.h", base_path.."/".."**_android.cc"}) removefiles({base_path.."/".."**_mac.h", base_path.."/".."**_mac.cc"}) removefiles({base_path.."/".."**_win.h", base_path.."/".."**_win.cc"}) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ base_path.."/"..base_match.."_win.h", base_path.."/"..base_match.."_win.cc", diff --git a/xenia-build b/xenia-build index 130032323..cfd134143 100755 --- a/xenia-build +++ b/xenia-build @@ -781,6 +781,8 @@ class BaseBuildCommand(Command): self.parser.add_argument( '--target', action='append', default=[], help='Builds only the given target(s).') + self.parser.add_argument( + '--arch', default='x86_64', help='Builds only the given architecture') self.parser.add_argument( '--force', action='store_true', help='Forces a full rebuild.') @@ -823,6 +825,7 @@ class BaseBuildCommand(Command): '/m', '/v:m', '/p:Configuration=' + args['config'], + '/p:Platform=' + "Windows-" + args['arch'], ] + ([targets] if targets is not None else []) + pass_args, shell=False) elif sys.platform == 'darwin':
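
A quick illustration of the FMOV-immediate helpers added in a64_util.h above. This harness is a sketch only and is not part of the patch; the check_fimm8 name and the sample constants are assumptions. f32_to_fimm8 succeeds only when the value fits the abcdefgh form described in the comments, meaning at most the top four mantissa bits set and an unbiased exponent in [-3, 4]; anything else has to be materialized another way by the emitter.

#include <cstdint>
#include <cstdio>

#include "xenia/cpu/backend/a64/a64_util.h"

// Illustrative harness (not part of the patch): prints whether each fp32 bit
// pattern can be encoded as a single FMOV (immediate).
void check_fimm8(uint32_t bits, const char* label) {
  oaknut::FImm8 imm(0, 0, 0);
  const bool ok = xe::cpu::backend::a64::f32_to_fimm8(bits, imm);
  std::printf("%-8s -> %s\n", label, ok ? "FMOV immediate" : "not encodable");
}

int main() {
  check_fimm8(0x40000000u, "2.0f");    // exp +1, mantissa 0: encodable
  check_fimm8(0x3F000000u, "0.5f");    // exp -1, mantissa 0: encodable
  check_fimm8(0x40490FDBu, "pi");      // needs more than 4 mantissa bits: rejected
  check_fimm8(0x42C80000u, "100.0f");  // exp +6 is outside [-3, 4]: rejected
  return 0;
}

On the build side, the new --arch option in xenia-build composes the MSBuild platform as "Windows-" plus the argument, so an invocation along the lines of ./xenia-build build --config=Release --arch=ARM64 should select the Windows-ARM64 premake platform defined above (the exact command line is an assumption; the default remains x86_64).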