dolphin/Source/Core/Common/Arm64Emitter.h

// Copyright 2015 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <array>
#include <bit>
#include <cstring>
#include <functional>
#include <optional>
#include <type_traits>
#include <utility>

#include "Common/ArmCommon.h"
#include "Common/Assert.h"
#include "Common/BitSet.h"
#include "Common/BitUtils.h"
#include "Common/CodeBlock.h"
#include "Common/Common.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "Common/SmallVector.h"

namespace Arm64Gen
{
// X30 serves a dual purpose as a link register
// Encoded as <u3:type><u5:reg>
// Types:
// 000 - 32bit GPR
// 001 - 64bit GPR
// 010 - VFP single precision
// 100 - VFP double precision
// 110 - VFP quad precision
enum class ARM64Reg
{
  // 32bit registers
  W0 = 0,
  W1,
  W2,
  W3,
  W4,
  W5,
  W6,
  W7,
  W8,
  W9,
  W10,
  W11,
  W12,
  W13,
  W14,
  W15,
  W16,
  W17,
  W18,
  W19,
  W20,
  W21,
  W22,
  W23,
  W24,
  W25,
  W26,
  W27,
  W28,
  W29,
  W30,

  WSP,  // 32bit stack pointer

  // 64bit registers
  X0 = 0x20,
  X1,
  X2,
  X3,
  X4,
  X5,
  X6,
  X7,
  X8,
  X9,
  X10,
  X11,
  X12,
  X13,
  X14,
  X15,
  X16,
  X17,
  X18,
  X19,
  X20,
  X21,
  X22,
  X23,
  X24,
  X25,
  X26,
  X27,
  X28,
  X29,
  X30,

  SP,  // 64bit stack pointer

  // VFP single precision registers
  S0 = 0x40,
  S1,
  S2,
  S3,
  S4,
  S5,
  S6,
  S7,
  S8,
  S9,
  S10,
  S11,
  S12,
  S13,
  S14,
  S15,
  S16,
  S17,
  S18,
  S19,
  S20,
  S21,
  S22,
  S23,
  S24,
  S25,
  S26,
  S27,
  S28,
  S29,
  S30,
  S31,

  // VFP Double Precision registers
  D0 = 0x80,
  D1,
  D2,
  D3,
  D4,
  D5,
  D6,
  D7,
  D8,
  D9,
  D10,
  D11,
  D12,
  D13,
  D14,
  D15,
  D16,
  D17,
  D18,
  D19,
  D20,
  D21,
  D22,
  D23,
  D24,
  D25,
  D26,
  D27,
  D28,
  D29,
  D30,
  D31,

  // ASIMD Quad-Word registers
  Q0 = 0xC0,
  Q1,
  Q2,
  Q3,
  Q4,
  Q5,
  Q6,
  Q7,
  Q8,
  Q9,
  Q10,
  Q11,
  Q12,
  Q13,
  Q14,
  Q15,
  Q16,
  Q17,
  Q18,
  Q19,
  Q20,
  Q21,
  Q22,
  Q23,
  Q24,
  Q25,
  Q26,
  Q27,
  Q28,
  Q29,
  Q30,
  Q31,

  // For PRFM(prefetch memory) encoding
  // This is encoded in the Rt register
  // Data preload
  PLDL1KEEP = 0,
  PLDL1STRM,
  PLDL2KEEP,
  PLDL2STRM,
  PLDL3KEEP,
  PLDL3STRM,
  // Instruction preload
  PLIL1KEEP = 8,
  PLIL1STRM,
  PLIL2KEEP,
  PLIL2STRM,
  PLIL3KEEP,
  PLIL3STRM,
  // Prepare for store
  PLTL1KEEP = 16,
  PLTL1STRM,
  PLTL2KEEP,
  PLTL2STRM,
  PLTL3KEEP,
  PLTL3STRM,

  WZR = WSP,
  ZR = SP,

  INVALID_REG = -1,
};

constexpr int operator&(const ARM64Reg& reg, const int mask)
{
  return static_cast<int>(reg) & mask;
}
constexpr int operator|(const ARM64Reg& reg, const int mask)
{
  return static_cast<int>(reg) | mask;
}
constexpr ARM64Reg operator+(const ARM64Reg& reg, const int addend)
{
  return static_cast<ARM64Reg>(static_cast<int>(reg) + addend);
}
constexpr bool Is64Bit(ARM64Reg reg)
{
  return (reg & 0x20) != 0;
}
constexpr bool IsSingle(ARM64Reg reg)
{
  return (reg & 0xC0) == 0x40;
}
constexpr bool IsDouble(ARM64Reg reg)
{
  return (reg & 0xC0) == 0x80;
}
constexpr bool IsScalar(ARM64Reg reg)
{
  return IsSingle(reg) || IsDouble(reg);
}
constexpr bool IsQuad(ARM64Reg reg)
{
  return (reg & 0xC0) == 0xC0;
}
constexpr bool IsVector(ARM64Reg reg)
{
  return (reg & 0xC0) != 0;
}
constexpr bool IsGPR(ARM64Reg reg)
{
  return static_cast<int>(reg) < 0x40;
}

constexpr int DecodeReg(ARM64Reg reg)
{
  return reg & 0x1F;
}
constexpr ARM64Reg EncodeRegTo32(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(DecodeReg(reg));
}
constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(reg | 0x20);
}
constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(ARM64Reg::S0 | DecodeReg(reg));
}
constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg)
{
  return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80);
}
constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(reg | 0xC0);
}

enum class ShiftType
{
  // Logical Shift Left
  LSL = 0,
  // Logical Shift Right
  LSR = 1,
  // Arithmetic Shift Right
  ASR = 2,
  // Rotate Right
  ROR = 3,
};

enum class ExtendSpecifier
{
  UXTB = 0x0,
  UXTH = 0x1,
  UXTW = 0x2, /* Also LSL on 32bit width */
  UXTX = 0x3, /* Also LSL on 64bit width */
  SXTB = 0x4,
  SXTH = 0x5,
  SXTW = 0x6,
  SXTX = 0x7,
};

enum class IndexType
{
  Unsigned,
  Post,
  Pre,
  Signed,  // used in LDP/STP
};

enum class ShiftAmount
{
  Shift0,
  Shift16,
  Shift32,
  Shift48,
};

enum class RoundingMode
{
  A,  // round to nearest, ties to away
  M,  // round towards -inf
  N,  // round to nearest, ties to even
  P,  // round towards +inf
  Z,  // round towards zero
};

enum class GPRSize
{
  B32,
  B64,
};

struct FixupBranch
{
  enum class Type : u32
  {
    CBZ,
    CBNZ,
    BConditional,
    TBZ,
    TBNZ,
    B,
    BL,
  };

  u8* ptr;
  Type type;
  // Used with B.cond
  CCFlags cond;
  // Used with TBZ/TBNZ
  u8 bit;
  // Used with Test/Compare and Branch
  ARM64Reg reg;
};

enum class PStateField
{
  SPSel = 0,
  DAIFSet,
  DAIFClr,
  NZCV,  // The only system registers accessible from EL0 (user space)
  PMCR_EL0,
  PMCCNTR_EL0,
  FPCR = 0x340,
  FPSR = 0x341,
};

enum class SystemHint
{
  NOP,
  YIELD,
  WFE,
  WFI,
  SEV,
  SEVL,
};

enum class BarrierType
{
  OSHLD = 1,
  OSHST = 2,
  OSH = 3,
  NSHLD = 5,
  NSHST = 6,
  NSH = 7,
  ISHLD = 9,
  ISHST = 10,
  ISH = 11,
  LD = 13,
  ST = 14,
  SY = 15,
};

class ArithOption
{
private:
  enum class WidthSpecifier
  {
    Default,
    Width32Bit,
    Width64Bit,
  };

  enum class TypeSpecifier
  {
    ExtendedReg,
    Immediate,
    ShiftedReg,
  };

  ARM64Reg m_destReg;
  WidthSpecifier m_width;
  ExtendSpecifier m_extend;
  TypeSpecifier m_type;
  ShiftType m_shifttype;
  u32 m_shift;

public:
  ArithOption(ARM64Reg Rd, bool index = false)
  {
    // Indexed registers are a certain feature of AARch64
    // On Loadstore instructions that use a register offset
    // We can have the register as an index
    // If we are indexing then the offset register will
    // be shifted to the left so we are indexing at intervals
    // of the size of what we are loading
    // 8-bit: Index does nothing
    // 16-bit: Index LSL 1
    // 32-bit: Index LSL 2
    // 64-bit: Index LSL 3
    if (index)
      m_shift = 4;
    else
      m_shift = 0;

    m_destReg = Rd;
    m_type = TypeSpecifier::ExtendedReg;
    if (Is64Bit(Rd))
    {
      m_width = WidthSpecifier::Width64Bit;
      m_extend = ExtendSpecifier::UXTX;
    }
    else
    {
      m_width = WidthSpecifier::Width32Bit;
      m_extend = ExtendSpecifier::UXTW;
    }
    m_shifttype = ShiftType::LSL;
  }
  ArithOption(ARM64Reg Rd, ExtendSpecifier extend_type, u32 shift = 0)
  {
    m_destReg = Rd;
    m_width = Is64Bit(Rd) ? WidthSpecifier::Width64Bit : WidthSpecifier::Width32Bit;
    m_extend = extend_type;
    m_type = TypeSpecifier::ExtendedReg;
    m_shifttype = ShiftType::LSL;
    m_shift = shift;
  }
  ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
  {
    m_destReg = Rd;
    m_shift = shift;
    m_shifttype = shift_type;
    m_type = TypeSpecifier::ShiftedReg;
    if (Is64Bit(Rd))
    {
      m_width = WidthSpecifier::Width64Bit;
      if (shift == 64)
        m_shift = 0;
      m_extend = ExtendSpecifier::UXTX;
    }
    else
    {
      m_width = WidthSpecifier::Width32Bit;
      if (shift == 32)
        m_shift = 0;
      m_extend = ExtendSpecifier::UXTW;
    }
  }
  ARM64Reg GetReg() const { return m_destReg; }
  u32 GetData() const
  {
    switch (m_type)
    {
    case TypeSpecifier::ExtendedReg:
      return (static_cast<u32>(m_extend) << 13) | (m_shift << 10);
    case TypeSpecifier::ShiftedReg:
      return (static_cast<u32>(m_shifttype) << 22) | (m_shift << 10);
    default:
      DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData");
      break;
    }
    return 0;
  }

  bool IsExtended() const { return m_type == TypeSpecifier::ExtendedReg; }
};

struct LogicalImm
{
  constexpr LogicalImm() {}

  constexpr LogicalImm(u8 r_, u8 s_, bool n_) : r(r_), s(s_), n(n_), valid(true) {}

  constexpr LogicalImm(u64 value, GPRSize size)
  {
    // Logical immediates are encoded using parameters n, imm_s and imm_r using
    // the following table:
    //
    //    N   imms    immr    size        S             R
    //    1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
    //    0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
    //    0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
    //    0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
    //    0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
    //    0  11110s  xxxxxr     2    UInt(s)       UInt(r)
    // (s bits must not be all set)
    //
    // A pattern is constructed of size bits, where the least significant S+1 bits
    // are set. The pattern is rotated right by R, and repeated across a 32 or
    // 64-bit value, depending on destination register width.

    if (size == GPRSize::B32)
    {
      // To handle 32-bit logical immediates, the very easiest thing is to repeat
      // the input value twice to make a 64-bit word. The correct encoding of that
      // as a logical immediate will also be the correct encoding of the 32-bit
      // value.

      value = (value << 32) | (value & 0xFFFFFFFF);
    }

    if (value == 0 || (~value) == 0)
    {
      valid = false;
      return;
    }

    // Normalize value, rotating it such that the LSB is 1:
    // If LSB is already one, we mask away the trailing sequence of ones and
    // pick the next sequence of ones. This ensures we get a complete element
    // that has not been cut-in-half due to rotation across the word boundary.

    const int rotation = std::countr_zero(value & (value + 1));
    const u64 normalized = std::rotr(value, rotation);

    const int element_size = std::countr_zero(normalized & (normalized + 1));
    const int ones = std::countr_one(normalized);

    // Check the value is repeating; also ensures element size is a power of two.

    if (std::rotr(value, element_size) != value)
    {
      valid = false;
      return;
    }

    // Now we're done. We just have to encode the S output in such a way that
    // it gives both the number of set bits and the length of the repeated
    // segment.

    r = static_cast<u8>((element_size - rotation) & (element_size - 1));
    s = static_cast<u8>((((~element_size + 1) << 1) | (ones - 1)) & 0x3f);
    n = Common::ExtractBit<6>(element_size);

    valid = true;
  }

  constexpr operator bool() const { return valid; }

  u8 r = 0;
  u8 s = 0;
  bool n = false;
  bool valid = false;
};

class ARM64XEmitter
{
  friend class ARM64FloatEmitter;

private:
  struct RegisterMove
  {
    ARM64Reg dst;
    ARM64Reg src;
  };

  // Pointer to memory where code will be emitted to.
  u8* m_code = nullptr;

  // Pointer past the end of the memory region we're allowed to emit to.
  // Writes that would reach this memory are refused and will set the m_write_failed flag instead.
  u8* m_code_end = nullptr;

  u8* m_lastCacheFlushEnd = nullptr;

  // Set to true when a write request happens that would write past m_code_end.
  // Must be cleared with SetCodePtr() afterwards.
  bool m_write_failed = false;

  void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags);
  void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
  void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
  void EncodeUnconditionalBranchInst(u32 op, const void* ptr);
  void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn);
  void EncodeExceptionInst(u32 instenc, u32 imm);
  void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt);
  void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
                            ArithOption Option);
  void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
  void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
  void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
  void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
  void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt);
  void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
  void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size);
  void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos);
  void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
  void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
                           s32 imm);
  void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
  void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  [[nodiscard]] FixupBranch WriteFixupBranch();

  // This function solves the "parallel moves" problem common in compilers.
  // The arguments are mutated!
  void ParallelMoves(RegisterMove* begin, RegisterMove* end, std::array<u8, 32>* source_gpr_usages);

  template <typename T>
  void MOVI2RImpl(ARM64Reg Rd, T imm);

protected:
  void Write32(u32 value);

public:
  ARM64XEmitter() = default;
  ARM64XEmitter(u8* code, u8* code_end)
      : m_code(code), m_code_end(code_end), m_lastCacheFlushEnd(code)
  {
  }

  virtual ~ARM64XEmitter() {}

  void SetCodePtr(u8* ptr, u8* end, bool write_failed = false);

  void SetCodePtrUnsafe(u8* ptr, u8* end, bool write_failed = false);
  const u8* GetCodePtr() const { return m_code; }
  u8* GetWritableCodePtr() { return m_code; }
  const u8* GetCodeEnd() const { return m_code_end; }
  u8* GetWritableCodeEnd() { return m_code_end; }
  void ReserveCodeSpace(u32 bytes);
  u8* AlignCode16();
  u8* AlignCodePage();
  void FlushIcache();
  void FlushIcacheSection(u8* start, u8* end);

  // Should be checked after a block of code has been generated to see if the code has been
  // successfully written to memory. Do not call the generated code when this returns true!
  bool HasWriteFailed() const { return m_write_failed; }

  // FixupBranch branching
  void SetJumpTarget(FixupBranch const& branch);
  [[nodiscard]] FixupBranch CBZ(ARM64Reg Rt);
  [[nodiscard]] FixupBranch CBNZ(ARM64Reg Rt);
  [[nodiscard]] FixupBranch B(CCFlags cond);
  [[nodiscard]] FixupBranch TBZ(ARM64Reg Rt, u8 bit);
  [[nodiscard]] FixupBranch TBNZ(ARM64Reg Rt, u8 bit);
  [[nodiscard]] FixupBranch B();
  [[nodiscard]] FixupBranch BL();

  // Compare and Branch
  void CBZ(ARM64Reg Rt, const void* ptr);
  void CBNZ(ARM64Reg Rt, const void* ptr);

  // Conditional Branch
  void B(CCFlags cond, const void* ptr);

  // Test and Branch
  void TBZ(ARM64Reg Rt, u8 bits, const void* ptr);
  void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr);

  // Unconditional Branch
  void B(const void* ptr);
  void BL(const void* ptr);

  // Unconditional Branch (register)
  void BR(ARM64Reg Rn);
  void BLR(ARM64Reg Rn);
  void RET(ARM64Reg Rn = ARM64Reg::X30);
  void ERET();
  void DRPS();

  // Exception generation
  void SVC(u32 imm);
  void HVC(u32 imm);
  void SMC(u32 imm);
  void BRK(u32 imm);
  void HLT(u32 imm);
  void DCPS1(u32 imm);
  void DCPS2(u32 imm);
  void DCPS3(u32 imm);

  // System
  void _MSR(PStateField field, u8 imm);
  void _MSR(PStateField field, ARM64Reg Rt);
  void MRS(ARM64Reg Rt, PStateField field);
  void CNTVCT(ARM64Reg Rt);

  void HINT(SystemHint op);
  void NOP() { HINT(SystemHint::NOP); }
  void SEV() { HINT(SystemHint::SEV); }
  void SEVL() { HINT(SystemHint::SEVL); }
  void WFE() { HINT(SystemHint::WFE); }
  void WFI() { HINT(SystemHint::WFI); }
  void YIELD() { HINT(SystemHint::YIELD); }

  void CLREX();
  void DSB(BarrierType type);
  void DMB(BarrierType type);
  void ISB(BarrierType type);

  // Add/Subtract (Extended/Shifted register)
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void CMN(ARM64Reg Rn, ARM64Reg Rm);
  void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void CMP(ARM64Reg Rn, ARM64Reg Rm);
  void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);

  // Add/Subtract (with carry)
  void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Conditional Compare (immediate)
  void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
  void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);

  // Conditional Compare (register)
  void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
  void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);

  // Conditional Select
  void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

  // Aliases
  void CSET(ARM64Reg Rd, CCFlags cond)
  {
    ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
    CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
  }
  void CSETM(ARM64Reg Rd, CCFlags cond)
  {
    ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
    CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
  }
  void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
  void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
  {
    SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
  }
  void NEGS(ARM64Reg Rd, ARM64Reg Rs) { SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
  void NEGS(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
  {
    SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
  }
  // Data-Processing 1 source
  void RBIT(ARM64Reg Rd, ARM64Reg Rn);
  void REV16(ARM64Reg Rd, ARM64Reg Rn);
  void REV32(ARM64Reg Rd, ARM64Reg Rn);
  void REV64(ARM64Reg Rd, ARM64Reg Rn);
  void CLZ(ARM64Reg Rd, ARM64Reg Rn);
  void CLS(ARM64Reg Rd, ARM64Reg Rn);

  // Data-Processing 2 source
  void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Data-Processing 3 source
  void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Logical (shifted register)
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, Rm); }
  void TST(ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
  {
    ANDS(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, Rm, Shift);
  }

  // Wrap the above for saner syntax
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    AND(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    BIC(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ORR(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ORN(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    EOR(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    EON(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ANDS(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    BICS(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  // Convenience wrappers around ORR. These match the official convenience syntax.
  void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift);
  void MOV(ARM64Reg Rd, ARM64Reg Rm);
  void MVN(ARM64Reg Rd, ARM64Reg Rm);

  // Convenience wrappers around UBFM/EXTR.
  void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift);

  // Logical (immediate)
  void AND(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void EOR(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void TST(ARM64Reg Rn, LogicalImm imm);
  // Add/subtract (immediate)
  void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
  void CMN(ARM64Reg Rn, u32 imm, bool shift = false);

  // Data Processing (Immediate)
  void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);
  void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);
  void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);

  // Bitfield move
  void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
  void BFXIL(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
  void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);

  // Extract register (ROR with two inputs, if same then faster on A67)
  void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);

  // Aliases
  void SXTB(ARM64Reg Rd, ARM64Reg Rn);
  void SXTH(ARM64Reg Rd, ARM64Reg Rn);
  void SXTW(ARM64Reg Rd, ARM64Reg Rn);
  void UXTB(ARM64Reg Rd, ARM64Reg Rn);
  void UXTH(ARM64Reg Rd, ARM64Reg Rn);

  void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); }
  // Load Register (Literal)
  void LDR(ARM64Reg Rt, u32 imm);
  void LDRSW(ARM64Reg Rt, u32 imm);
  void PRFM(ARM64Reg Rt, u32 imm);

  // Load/Store Exclusive
  void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void LDXRB(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXRB(ARM64Reg Rt, ARM64Reg Rn);
  void STLRB(ARM64Reg Rt, ARM64Reg Rn);
  void LDARB(ARM64Reg Rt, ARM64Reg Rn);
  void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void LDXRH(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXRH(ARM64Reg Rt, ARM64Reg Rn);
  void STLRH(ARM64Reg Rt, ARM64Reg Rn);
  void LDARH(ARM64Reg Rt, ARM64Reg Rn);
  void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void LDXR(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXR(ARM64Reg Rt, ARM64Reg Rn);
  void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void STLR(ARM64Reg Rt, ARM64Reg Rn);
  void LDAR(ARM64Reg Rt, ARM64Reg Rn);

  // Load/Store no-allocate pair (offset)
  void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
  void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);

  // Load/Store register (immediate indexed)
  void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Load/Store register (register offset)
  void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

  // Load/Store register (unscaled offset)
  void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Load/Store pair
  void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

  // Address of label/page PC-relative
  void ADR(ARM64Reg Rd, s32 imm);
  void ADRP(ARM64Reg Rd, s64 imm);

  // Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
  void MOVI2R(ARM64Reg Rd, u64 imm);
  bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
  template <class P>
  void MOVP2R(ARM64Reg Rd, P* ptr)
  {
    ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
    MOVI2R(Rd, reinterpret_cast<uintptr_t>(ptr));
  }
  template <class P>
  // Given an address, stores the page address into a register and returns the page-relative offset
  s32 MOVPage2R(ARM64Reg Rd, P* ptr)
  {
    ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
    MOVI2R(Rd, reinterpret_cast<uintptr_t>(ptr) & ~0xFFFULL);
    return static_cast<s32>(reinterpret_cast<uintptr_t>(ptr) & 0xFFFULL);
  }

  // Wrappers around bitwise operations with an immediate. If you're sure an imm can be encoded
  // without a scratch register, preferably construct a LogicalImm directly instead,
  // since that is constexpr and thus can be done at compile time for constant values.
  void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
  {
    ANDSI2R(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, imm, scratch);
  }
  void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);

  // Wrappers around arithmetic operations with an immediate.
  void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
                       ARM64Reg scratch);
  void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);

  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryCMPI2R(ARM64Reg Rn, u64 imm);

  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);

  // ABI related
  static constexpr BitSet32 CALLER_SAVED_GPRS = BitSet32(0x4007FFFF);
  static constexpr BitSet32 CALLER_SAVED_FPRS = BitSet32(0xFFFF00FF);
  void ABI_PushRegisters(BitSet32 registers);
  void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));

  // Plain function call
  void QuickCallFunction(ARM64Reg scratchreg, const void* func);
  template <typename T>
  void QuickCallFunction(ARM64Reg scratchreg, T func)
  {
    QuickCallFunction(scratchreg, (const void*)func);
  }

  template <typename FuncRet, typename... FuncArgs, typename... Args>
  void ABI_CallFunction(FuncRet (*func)(FuncArgs...), Args... args)
  {
    static_assert(sizeof...(FuncArgs) == sizeof...(Args), "Wrong number of arguments");
    static_assert(sizeof...(FuncArgs) <= 8, "Passing arguments on the stack is not supported");

    if constexpr (!std::is_void_v<FuncRet>)
      static_assert(sizeof(FuncRet) <= 16, "Large return types are not supported");

    std::array<u8, 32> source_gpr_uses{};

    auto check_argument = [&](auto& arg) {
      using Arg = std::decay_t<decltype(arg)>;

      if constexpr (std::is_same_v<Arg, ARM64Reg>)
      {
        ASSERT(IsGPR(arg));
        source_gpr_uses[DecodeReg(arg)]++;
      }
      else
      {
        // To be more correct, we should be checking FuncArgs here rather than Args, but that's a
        // lot more effort to implement. Let's just do these best-effort checks for now.
        static_assert(!std::is_floating_point_v<Arg>, "Floating-point arguments are not supported");
        static_assert(sizeof(Arg) <= 8, "Arguments bigger than a register are not supported");
      }
    };

    (check_argument(args), ...);

    {
      Common::SmallVector<RegisterMove, sizeof...(Args)> pending_moves;

      size_t i = 0;

      auto handle_register_argument = [&](auto& arg) {
        using Arg = std::decay_t<decltype(arg)>;

        if constexpr (std::is_same_v<Arg, ARM64Reg>)
        {
          const ARM64Reg dst_reg =
              (Is64Bit(arg) ? EncodeRegTo64 : EncodeRegTo32)(static_cast<ARM64Reg>(i));

          if (dst_reg == arg)
          {
            // The value is already in the right register.
            source_gpr_uses[DecodeReg(arg)]--;
          }
          else if (source_gpr_uses[i] == 0)
          {
            // The destination register isn't used as the source of another move.
            // We can go ahead and do the move right away.
            MOV(dst_reg, arg);
            source_gpr_uses[DecodeReg(arg)]--;
          }
          else
          {
            // The destination register is used as the source of a move we haven't gotten to yet.
            // Let's record that we need to deal with this move later.
            pending_moves.emplace_back(dst_reg, arg);
          }
        }

        ++i;
      };

      (handle_register_argument(args), ...);

      if (!pending_moves.empty())
      {
        ParallelMoves(pending_moves.data(), pending_moves.data() + pending_moves.size(),
                      &source_gpr_uses);
      }
    }

    {
      size_t i = 0;

      auto handle_immediate_argument = [&](auto& arg) {
        using Arg = std::decay_t<decltype(arg)>;

        if constexpr (!std::is_same_v<Arg, ARM64Reg>)
        {
          const ARM64Reg dst_reg =
              (sizeof(arg) == 8 ? EncodeRegTo64 : EncodeRegTo32)(static_cast<ARM64Reg>(i));
          if constexpr (std::is_pointer_v<Arg>)
            MOVP2R(dst_reg, arg);
          else
            MOVI2R(dst_reg, arg);
        }

        ++i;
      };

      (handle_immediate_argument(args), ...);
    }

    QuickCallFunction(ARM64Reg::X8, func);
  }

  // Utility to generate a call to a std::function object.
  //
  // Unfortunately, calling operator() directly is undefined behavior in C++
  // (this method might be a thunk in the case of multi-inheritance) so we
  // have to go through a trampoline function.
  template <typename T, typename... Args>
  static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
  {
    return (*f)(args...);
  }

  template <typename FuncRet, typename... FuncArgs, typename... Args>
  void ABI_CallLambdaFunction(const std::function<FuncRet(FuncArgs...)>* f, Args... args)
  {
    auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<FuncRet, FuncArgs...>;
    ABI_CallFunction(trampoline, f, args...);
  }
};

class ARM64FloatEmitter
{
public:
  ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}
  void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Loadstore unscaled
  void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Loadstore single structure
  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);

  // Loadstore multiple structure
  void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
  void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = ARM64Reg::SP);
  void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
  void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = ARM64Reg::SP);

  // Loadstore paired
  void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

  // Loadstore register offset
  void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

  // Scalar - 1 Source
  void FABS(ARM64Reg Rd, ARM64Reg Rn);
  void FNEG(ARM64Reg Rd, ARM64Reg Rn);
  void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
  void FRINTI(ARM64Reg Rd, ARM64Reg Rn);
  void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP
  void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
  void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar - pairwise
  void FADDP(ARM64Reg Rd, ARM64Reg Rn);
  void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
  void FMINP(ARM64Reg Rd, ARM64Reg Rn);
  void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
  void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar - 2 Source
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Scalar - 3 Source. Note - the accumulator is last on ARM!
  void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);

  // Scalar floating point immediate
  void FMOV(ARM64Reg Rd, uint8_t imm8);

  // Vector
  void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
  void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void NOT(ARM64Reg Rd, ARM64Reg Rn);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
  void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
  void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
  void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);

  // Move
  void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
  void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2);
  void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
  void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);

  // One source
  void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

  // Scalar convert float to int, in a lot of variants.
  // Note that the scalar version of this operation has two encodings, one that goes to an integer
  // register
  // and one that outputs to a scalar fp register.
  void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
  void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);

  // Scalar convert int to float. No rounding mode specifier necessary.
  void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
  void UCVTF(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar fixed point to float. scale is the number of fractional bits.
  void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
  void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);

  // Float comparison
  void FCMP(ARM64Reg Rn, ARM64Reg Rm);
  void FCMP(ARM64Reg Rn);
  void FCMPE(ARM64Reg Rn, ARM64Reg Rm);
  void FCMPE(ARM64Reg Rn);
  void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Conditional select
  void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

  // Permute
  void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Extract
  void EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 index);

  // Scalar shift by immediate
  void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift);

  // Vector shift by immediate
  void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);

  // vector x indexed element
  void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
  void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);

  // Modified Immediate
  void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
  void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
  void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);

  void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
              bool negate = false);
  void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG);

  // ABI related
  void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = ARM64Reg::INVALID_REG);
  void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = ARM64Reg::INVALID_REG);

private:
  ARM64XEmitter* m_emit;
  inline void Write32(u32 value) { m_emit->Write32(value); }
  // Emitting functions
  void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                         ARM64Reg Rm);
  void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                    ARM64Reg Rn);
  void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                    ARM64Reg Rn, ARM64Reg Rm);
  void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale,
                       ARM64Reg Rd, ARM64Reg Rn);
  void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
  void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitExtract(u32 imm4, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
  void EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
  void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn,
                                          ARM64Reg Rm);
  void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn,
                          ARM64Reg Rm);
  void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
  void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra,
                         int opcode);
  void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
                           ARM64Reg Rn, s32 imm);
  void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);

  void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);

  void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
  void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
};

class ARM64CodeBlock : public Common::CodeBlock<ARM64XEmitter>
{
private:
  void PoisonMemory() override
  {
    // If our memory isn't a multiple of u32 then this won't write the last remaining bytes with
    // anything
    // Less than optimal, but there would be nothing we could do but throw a runtime warning anyway.
    // AArch64: 0xD4200000 = BRK 0
    constexpr u32 brk_0 = 0xD4200000;

    for (size_t i = 0; i < region_size; i += sizeof(u32))
    {
      std::memcpy(region + i, &brk_0, sizeof(u32));
    }
  }
};
}  // namespace Arm64Gen