flycast/core/deps/vixl/aarch64/macro-assembler-sve-aarch64.cc

// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "macro-assembler-aarch64.h"
namespace vixl {
namespace aarch64 {
void MacroAssembler::AddSubHelper(AddSubHelperOption option,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(imm.FitsInLane(zd));
// Simple, encodable cases.
if (TrySingleAddSub(option, zd, zn, imm)) return;
VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
bool add_imm = (option == kAddImmediate);
// Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
// instruction. Also interpret the immediate as signed, so we can convert
// Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
if (signed_imm.IsNegative()) {
AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
IntegerOperand n_imm(signed_imm.GetMagnitude());
// IntegerOperand can represent -INT64_MIN, so this is always safe.
VIXL_ASSERT(n_imm.IsPositiveOrZero());
if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
}
// Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Dup(scratch, imm);
SingleEmissionCheckScope guard(this);
if (add_imm) {
add(zd, zn, scratch);
} else {
sub(zd, zn, scratch);
}
}
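// Try to encode "zd = zn +/- imm" as a single (optionally movprfx'd) ADD or SUB
// taking an unsigned 8-bit immediate, optionally shifted left by 8. Returns
// false if no such encoding exists, so the caller can fall back to a vector
// operand.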
bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(imm.FitsInLane(zd));
int imm8;
int shift = -1;
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
MovprfxHelperScope guard(this, zd, zn);
switch (option) {
case kAddImmediate:
add(zd, zd, imm8, shift);
return true;
case kSubImmediate:
sub(zd, zd, imm8, shift);
return true;
}
}
return false;
}
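// Shared expansion for wide-immediate forms (e.g. MUL, SMIN, UMAX): immediates
// that fit the instruction's signed or unsigned 8-bit field are encoded
// directly; anything else is broadcast into a scratch Z register and handled
// by the predicated register-form macro under an all-true governing predicate.
// For example, Mul(z0.VnS(), z1.VnS(), 1000) should take the fallback path,
// since 1000 does not fit in a signed 8-bit immediate.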
void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
SVEArithPredicatedFn reg_macro,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm,
bool is_signed) {
if (is_signed) {
// E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
if (imm.IsInt8()) {
MovprfxHelperScope guard(this, zd, zn);
(this->*imm_fn)(zd, zd, imm.AsInt8());
return;
}
} else {
// E.g. UMIN_z_zi, UMAX_z_zi
if (imm.IsUint8()) {
MovprfxHelperScope guard(this, zd, zn);
(this->*imm_fn)(zd, zd, imm.AsUint8());
return;
}
}
UseScratchRegisterScope temps(this);
PRegister pg = temps.AcquireGoverningP();
Ptrue(pg.WithSameLaneSizeAs(zd));
// Try to re-use zd if we can, so we can avoid a movprfx.
ZRegister scratch =
zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
: zd;
Dup(scratch, imm);
// The vector-form macro for commutative operations will swap the arguments to
// avoid movprfx, if necessary.
(this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}
void MacroAssembler::Mul(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
IntArithImmFn imm_fn = &Assembler::mul;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Smin(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInSignedLane(zd));
IntArithImmFn imm_fn = &Assembler::smin;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Smax(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInSignedLane(zd));
IntArithImmFn imm_fn = &Assembler::smax;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Umax(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
IntArithImmFn imm_fn = &Assembler::umax;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}
void MacroAssembler::Umin(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
IntArithImmFn imm_fn = &Assembler::umin;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}
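// Compute xd = xn + (PL * multiplier), where PL is the predicate length in
// bytes (VL / 8). Encodable multipliers use `addpl` directly (materialising a
// zero first if xn is xzr), multiples of 8 are delegated to `Addvl`, and the
// general case derives PL from `rdvl` with an arithmetic shift right by 3.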
void MacroAssembler::Addpl(const Register& xd,
const Register& xn,
int64_t multiplier) {
VIXL_ASSERT(allow_macro_instructions_);
// This macro relies on `Rdvl` to handle some out-of-range cases. Check that
// `VL * multiplier` cannot overflow, for any possible value of VL.
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
if (xd.IsZero()) return;
if (xn.IsZero() && xd.IsSP()) {
// TODO: This operation doesn't make much sense, but we could support it
// with a scratch register if necessary.
VIXL_UNIMPLEMENTED();
}
// Handling xzr requires an extra move, so defer it until later, when we may be
// able to use `rdvl` (via `Addvl`) instead.
if (IsInt6(multiplier) && !xn.IsZero()) {
SingleEmissionCheckScope guard(this);
addpl(xd, xn, static_cast<int>(multiplier));
return;
}
// If `multiplier` is a multiple of 8, we can use `Addvl` instead.
if ((multiplier % kZRegBitsPerPRegBit) == 0) {
Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
return;
}
if (IsInt6(multiplier)) {
VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`.
// There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
// materialise a zero.
MacroEmissionCheckScope guard(this);
movz(xd, 0);
addpl(xd, xd, static_cast<int>(multiplier));
return;
}
// TODO: Some probable cases result in rather long sequences. For example,
// `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
// outside the encodable range. We should look for ways to cover such cases
// without drastically increasing the complexity of this logic.
// For other cases, calculate xn + (PL * multiplier) using discrete
// instructions. This requires two scratch registers in the general case, so
// try to re-use the destination as a scratch register.
UseScratchRegisterScope temps(this);
temps.Include(xd);
temps.Exclude(xn);
Register scratch = temps.AcquireX();
// There is no `rdpl`, so we have to calculate PL from VL. We can't scale the
// multiplier because (we already know) it isn't a multiple of 8.
Rdvl(scratch, multiplier);
MacroEmissionCheckScope guard(this);
if (xn.IsZero()) {
asr(xd, scratch, kZRegBitsPerPRegBitLog2);
} else if (xd.IsSP() || xn.IsSP()) {
// TODO: MacroAssembler::Add should be able to handle this.
asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
add(xd, xn, scratch);
} else {
add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
}
}
void MacroAssembler::Addvl(const Register& xd,
const Register& xn,
int64_t multiplier) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(xd.IsX());
VIXL_ASSERT(xn.IsX());
// Check that `VL * multiplier` cannot overflow, for any possible value of VL.
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
if (xd.IsZero()) return;
if (xn.IsZero() && xd.IsSP()) {
// TODO: This operation doesn't make much sense, but we could support it
// with a scratch register if necessary. `rdvl` cannot write into `sp`.
VIXL_UNIMPLEMENTED();
}
if (IsInt6(multiplier)) {
SingleEmissionCheckScope guard(this);
if (xn.IsZero()) {
rdvl(xd, static_cast<int>(multiplier));
} else {
addvl(xd, xn, static_cast<int>(multiplier));
}
return;
}
// TODO: Some probable cases result in rather long sequences. For example,
// `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
// outside the encodable range. We should look for ways to cover such cases
// without drastically increasing the complexity of this logic.
// For other cases, calculate xn + (VL * multiplier) using discrete
// instructions. This requires two scratch registers in the general case, so
// we try to re-use the destination as a scratch register.
UseScratchRegisterScope temps(this);
temps.Include(xd);
temps.Exclude(xn);
Register a = temps.AcquireX();
Mov(a, multiplier);
MacroEmissionCheckScope guard(this);
Register b = temps.AcquireX();
rdvl(b, 1);
if (xn.IsZero()) {
mul(xd, a, b);
} else if (xd.IsSP() || xn.IsSP()) {
mul(a, a, b);
add(xd, xn, a);
} else {
madd(xd, a, b, xn);
}
}
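// Materialise the effective address of a non-scatter-gather SVEMemOperand into
// xd. For "#imm, MUL VL" offsets, `vl_divisor_log2` is log2 of (VL divided by
// the in-memory access size), which lets the offset be lowered to an
// `Addpl`/`Addvl` sequence.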
void MacroAssembler::CalculateSVEAddress(const Register& xd,
const SVEMemOperand& addr,
int vl_divisor_log2) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(!addr.IsScatterGather());
VIXL_ASSERT(xd.IsX());
// The lower bound is where a whole Z register is accessed.
VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
// The upper bound is for P register accesses, and for instructions like
// "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
SVEOffsetModifier mod = addr.GetOffsetModifier();
Register base = addr.GetScalarBase();
if (addr.IsEquivalentToScalar()) {
// For example:
// [x0]
// [x0, #0]
// [x0, xzr, LSL 2]
Mov(xd, base);
} else if (addr.IsScalarPlusImmediate()) {
// For example:
// [x0, #42]
// [x0, #42, MUL VL]
int64_t offset = addr.GetImmediateOffset();
VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar.
if (addr.IsMulVl()) {
int vl_divisor = 1 << vl_divisor_log2;
// For all possible values of vl_divisor, we can simply use `Addpl`. This
// will select `addvl` if necessary.
VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
} else {
// IsScalarPlusImmediate() ensures that no other modifiers can occur.
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
Add(xd, base, offset);
}
} else if (addr.IsScalarPlusScalar()) {
// For example:
// [x0, x1]
// [x0, x1, LSL #4]
Register offset = addr.GetScalarOffset();
VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar.
if (mod == SVE_LSL) {
Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
} else {
// IsScalarPlusScalar() ensures that no other modifiers can occur.
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
Add(xd, base, offset);
}
} else {
// All other forms are scatter-gather addresses, which cannot be evaluated
// into an X register.
VIXL_UNREACHABLE();
}
}
void MacroAssembler::Cpy(const ZRegister& zd,
const PRegister& pg,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zd));
int imm8;
int shift;
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
SingleEmissionCheckScope guard(this);
cpy(zd, pg, imm8, shift);
return;
}
// The fallbacks rely on `cpy` variants that only support merging predication.
// If zeroing predication was requested, zero the destination first.
if (pg.IsZeroing()) {
SingleEmissionCheckScope guard(this);
dup(zd, 0);
}
PRegisterM pg_m = pg.Merging();
// Try to encode the immediate using fcpy.
VIXL_ASSERT(imm.FitsInLane(zd));
if (zd.GetLaneSizeInBits() >= kHRegSize) {
double fp_imm = 0.0;
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
fp_imm =
FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
break;
case kSRegSize:
fp_imm = RawbitsToFloat(imm.AsUint32());
break;
case kDRegSize:
fp_imm = RawbitsToDouble(imm.AsUint64());
break;
default:
VIXL_UNREACHABLE();
break;
}
// IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
// we can use IsImmFP64 for all lane sizes.
if (IsImmFP64(fp_imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg_m, fp_imm);
return;
}
}
// Fall back to using a scratch register.
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zd);
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
cpy(zd, pg_m, scratch);
}
// TODO: We implement Fcpy (amongst other things) for all FP types because it
// allows us to preserve user-specified NaNs. We should come up with some
// FPImmediate type to abstract this, and avoid all the duplication below (and
// elsewhere).
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
double imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP64(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
float imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP32(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
Float16 imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP16(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
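// Broadcast an integer immediate to all lanes of zd. This prefers the shifted
// 8-bit immediate form of `dup`, then a `dupm` bitmask immediate, and finally
// falls back to materialising the value in a general-purpose scratch register.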
void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zd));
unsigned lane_size = zd.GetLaneSizeInBits();
int imm8;
int shift;
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
SingleEmissionCheckScope guard(this);
dup(zd, imm8, shift);
} else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
SingleEmissionCheckScope guard(this);
dupm(zd, imm.AsUintN(lane_size));
} else {
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zd);
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
dup(zd, scratch);
}
}
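// Shared expansion for predicated, destructive operations that have a reversed
// form (e.g. SUB/SUBR, FDIV/FDIVR): operate in place when zd aliases zn, switch
// to the reversed encoding when zd aliases zm, and otherwise movprfx zn into
// zd. For example, Fsub(z0.VnD(), pg, z1.VnD(), z0.VnD()) should lower to a
// single "fsubr z0.d, pg/m, z0.d, z1.d".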
void MacroAssembler::NoncommutativeArithmeticHelper(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
SVEArithPredicatedFn fn,
SVEArithPredicatedFn rev_fn) {
if (zd.Aliases(zn)) {
// E.g. zd = zd / zm
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zn, zm);
} else if (zd.Aliases(zm)) {
// E.g. zd = zn / zd
SingleEmissionCheckScope guard(this);
(this->*rev_fn)(zd, pg, zm, zn);
} else {
// E.g. zd = zn / zm
MovprfxHelperScope guard(this, zd, pg, zn);
(this->*fn)(zd, pg, zd, zm);
}
}
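// Shared expansion for predicated FP operations that are commutative (FADD,
// FMUL, FMAX, ...): when zd aliases zm, the operands can simply be swapped
// under FastNaNPropagation, while StrictNaNPropagation preserves the written
// operand order by building the result in a scratch register first.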
void MacroAssembler::FPCommutativeArithmeticHelper(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
SVEArithPredicatedFn fn,
FPMacroNaNPropagationOption nan_option) {
ResolveFPNaNPropagationOption(&nan_option);
if (zd.Aliases(zn)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zd, zm);
} else if (zd.Aliases(zm)) {
switch (nan_option) {
case FastNaNPropagation: {
// Swap the arguments.
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zd, zn);
return;
}
case StrictNaNPropagation: {
UseScratchRegisterScope temps(this);
// Use a scratch register to keep the argument order exactly as
// specified.
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
{
MovprfxHelperScope guard(this, scratch, pg, zn);
(this->*fn)(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
return;
}
case NoFPMacroNaNPropagationSelected:
VIXL_UNREACHABLE();
return;
}
} else {
MovprfxHelperScope guard(this, zd, pg, zn);
(this->*fn)(zd, pg, zd, zm);
}
}
// Instructions of the form "inst zda, zn, zm, #num", which are non-commutative
// and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
V(Cmla, cmla) \
V(Sqrdcmlah, sqrdcmlah)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int imm) { \
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
UseScratchRegisterScope temps(this); \
VIXL_ASSERT(AreSameLaneSize(zn, zm)); \
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \
Mov(ztmp, zd.Aliases(zn) ? zn : zm); \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, \
(zd.Aliases(zn) ? ztmp : zn), \
(zd.Aliases(zm) ? ztmp : zm), \
imm); \
} else { \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, zn, zm, imm); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, zn, zm, #num, #num", which are
// non-commutative and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
V(Cmla, cmla) \
V(Sqrdcmlah, sqrdcmlah)
// This doesn't handle zm when it is out of the range that can be encoded in
// the instruction. The range depends on the element size: z0-z7 for H,
// z0-z15 for S.
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int index, \
int rot) { \
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
UseScratchRegisterScope temps(this); \
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \
{ \
MovprfxHelperScope guard(this, ztmp, za); \
ASMFN(ztmp, zn, zm, index, rot); \
} \
Mov(zd, ztmp); \
} else { \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, zn, zm, index, rot); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
V(Addp, addp) \
V(Bic, bic) \
V(Faddp, faddp) \
V(Fmaxnmp, fmaxnmp) \
V(Fminnmp, fminnmp) \
V(Fmaxp, fmaxp) \
V(Fminp, fminp) \
V(Fscale, fscale) \
V(Smaxp, smaxp) \
V(Sminp, sminp) \
V(Suqadd, suqadd) \
V(Umaxp, umaxp) \
V(Uminp, uminp) \
V(Usqadd, usqadd)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const PRegisterM& pg, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
if (zd.Aliases(zm) && !zd.Aliases(zn)) { \
UseScratchRegisterScope temps(this); \
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
Mov(scratch, zm); \
MovprfxHelperScope guard(this, zd, pg, zn); \
ASMFN(zd, pg, zd, scratch); \
} else { \
MovprfxHelperScope guard(this, zd, pg, zn); \
ASMFN(zd, pg, zd, zm); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which a reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
V(Asr, asr) \
V(Fdiv, fdiv) \
V(Fsub, fsub) \
V(Lsl, lsl) \
V(Lsr, lsr) \
V(Sdiv, sdiv) \
V(Shsub, shsub) \
V(Sqrshl, sqrshl) \
V(Sqshl, sqshl) \
V(Sqsub, sqsub) \
V(Srshl, srshl) \
V(Sub, sub) \
V(Udiv, udiv) \
V(Uhsub, uhsub) \
V(Uqrshl, uqrshl) \
V(Uqshl, uqshl) \
V(Uqsub, uqsub) \
V(Urshl, urshl)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const PRegisterM& pg, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
NoncommutativeArithmeticHelper(zd, \
pg, \
zn, \
zm, \
static_cast<SVEArithPredicatedFn>( \
&Assembler::ASMFN), \
static_cast<SVEArithPredicatedFn>( \
&Assembler::ASMFN##r)); \
}
VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
void MacroAssembler::Fadd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fadd),
nan_option);
}
void MacroAssembler::Fabd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fabd),
nan_option);
}
void MacroAssembler::Fmul(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmul),
nan_option);
}
void MacroAssembler::Fmulx(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmulx),
nan_option);
}
void MacroAssembler::Fmax(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmax),
nan_option);
}
void MacroAssembler::Fmin(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmin),
nan_option);
}
void MacroAssembler::Fmaxnm(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmaxnm),
nan_option);
}
void MacroAssembler::Fminnm(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fminnm),
nan_option);
}
void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
Fdup(zd, Float16(imm));
break;
case kSRegSize:
Fdup(zd, static_cast<float>(imm));
break;
case kDRegSize:
uint64_t bits = DoubleToRawbits(imm);
if (IsImmFP64(bits)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, bits);
}
break;
}
}
void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
Fdup(zd, Float16(imm));
break;
case kSRegSize:
if (IsImmFP32(imm)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, FloatToRawbits(imm));
}
break;
case kDRegSize:
Fdup(zd, static_cast<double>(imm));
break;
}
}
void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
if (IsImmFP16(imm)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, Float16ToRawbits(imm));
}
break;
case kSRegSize:
Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
break;
case kDRegSize:
Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
break;
}
}
void MacroAssembler::Index(const ZRegister& zd,
const Operand& start,
const Operand& step) {
class IndexOperand : public Operand {
public:
static IndexOperand Prepare(MacroAssembler* masm,
UseScratchRegisterScope* temps,
const Operand& op,
const ZRegister& zd_inner) {
// Look for encodable immediates.
int imm;
if (op.IsImmediate()) {
if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
return IndexOperand(imm);
}
Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
masm->Mov(scratch, op);
return IndexOperand(scratch);
} else {
// Plain registers can be encoded directly.
VIXL_ASSERT(op.IsPlainRegister());
return IndexOperand(op.GetRegister());
}
}
int GetImm5() const {
int64_t imm = GetImmediate();
VIXL_ASSERT(IsInt5(imm));
return static_cast<int>(imm);
}
private:
explicit IndexOperand(const Register& reg) : Operand(reg) {}
explicit IndexOperand(int64_t imm) : Operand(imm) {}
};
UseScratchRegisterScope temps(this);
IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
SingleEmissionCheckScope guard(this);
if (start_enc.IsImmediate()) {
if (step_enc.IsImmediate()) {
index(zd, start_enc.GetImm5(), step_enc.GetImm5());
} else {
index(zd, start_enc.GetImm5(), step_enc.GetRegister());
}
} else {
if (step_enc.IsImmediate()) {
index(zd, start_enc.GetRegister(), step_enc.GetImm5());
} else {
index(zd, start_enc.GetRegister(), step_enc.GetRegister());
}
}
}
void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zdn));
if (imm.IsZero()) {
SingleEmissionCheckScope guard(this);
insr(zdn, xzr);
return;
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zdn);
// TODO: There are many cases where we could optimise immediates, such as by
// detecting repeating patterns or FP immediates. We should optimise and
// abstract this for use in other SVE mov-immediate-like macros.
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
insr(zdn, scratch);
}
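// Mla and Mls pick between the accumulator-destructive (MLA/MLS) and
// multiplicand-destructive (MAD/MSB) encodings according to which input zd
// aliases, relying on the commutativity of the multiplication; only the case
// where zd is distinct from all inputs needs a movprfx.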
void MacroAssembler::Mla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(za)) {
// zda = zda + (zn * zm)
SingleEmissionCheckScope guard(this);
mla(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = za + (zdn * zm)
SingleEmissionCheckScope guard(this);
mad(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
// Multiplication is commutative, so we can swap zn and zm.
// zdm = za + (zdm * zn)
SingleEmissionCheckScope guard(this);
mad(zd, pg, zn, za);
} else {
// zd = za + (zn * zm)
ExactAssemblyScope guard(this, 2 * kInstructionSize);
movprfx(zd, pg, za);
mla(zd, pg, zn, zm);
}
}
void MacroAssembler::Mls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(za)) {
// zda = zda - (zn * zm)
SingleEmissionCheckScope guard(this);
mls(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = za - (zdn * zm)
SingleEmissionCheckScope guard(this);
msb(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
// Multiplication is commutative, so we can swap zn and zm.
// zdm = za - (zdm * zn)
SingleEmissionCheckScope guard(this);
msb(zd, pg, zn, za);
} else {
// zd = za - (zn * zm)
ExactAssemblyScope guard(this, 2 * kInstructionSize);
movprfx(zd, pg, za);
mls(zd, pg, zn, zm);
}
}
void MacroAssembler::CompareHelper(Condition cond,
const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
IntegerOperand imm) {
UseScratchRegisterScope temps(this);
ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Dup(zm, imm);
SingleEmissionCheckScope guard(this);
cmp(cond, pd, pg, zn, zm);
}
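// `pfirst` and `pnext` are destructive (they require pd == pn), so when pd
// differs from pn the macro copies pn into pd first, preserving pg in a
// scratch predicate if pd aliases it.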
void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pd.IsLaneSizeB());
VIXL_ASSERT(pn.IsLaneSizeB());
if (pd.Is(pn)) {
SingleEmissionCheckScope guard(this);
pfirst(pd, pg, pn);
} else {
UseScratchRegisterScope temps(this);
PRegister temp_pg = pg;
if (pd.Aliases(pg)) {
temp_pg = temps.AcquireP();
Mov(temp_pg.VnB(), pg.VnB());
}
Mov(pd, pn);
SingleEmissionCheckScope guard(this);
pfirst(pd, temp_pg, pd);
}
}
void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(AreSameFormat(pd, pn));
if (pd.Is(pn)) {
SingleEmissionCheckScope guard(this);
pnext(pd, pg, pn);
} else {
UseScratchRegisterScope temps(this);
PRegister temp_pg = pg;
if (pd.Aliases(pg)) {
temp_pg = temps.AcquireP();
Mov(temp_pg.VnB(), pg.VnB());
}
Mov(pd.VnB(), pn.VnB());
SingleEmissionCheckScope guard(this);
pnext(pd, temp_pg, pd);
}
}
void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
SVEPredicateConstraint pattern,
FlagsUpdate s) {
VIXL_ASSERT(allow_macro_instructions_);
switch (s) {
case LeaveFlags:
Ptrue(pd, pattern);
return;
case SetFlags:
Ptrues(pd, pattern);
return;
}
VIXL_UNREACHABLE();
}
void MacroAssembler::Sub(const ZRegister& zd,
IntegerOperand imm,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
int imm8;
int shift = -1;
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
MovprfxHelperScope guard(this, zd, zm);
subr(zd, zd, imm8, shift);
} else {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
Dup(scratch, imm);
SingleEmissionCheckScope guard(this);
sub(zd, scratch, zm);
}
}
void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr,
SVELoadBroadcastFn fn,
int divisor) {
VIXL_ASSERT(addr.IsScalarPlusImmediate());
int64_t imm = addr.GetImmediateOffset();
if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
} else {
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, zt);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
}
void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
const SVEMemOperand& addr,
SVELoadStoreFn fn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
addr.IsMulVl())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
return;
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, rt);
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, SVEMemOperand(scratch));
}
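// Shared expansion for non-temporal and quadword/octoword broadcast forms: the
// scalar-plus-immediate encoding is used when the offset is a multiple of
// (1 << shift_amount), fits in `imm_bits` and uses `supported_modifier`;
// otherwise the address is computed into a scratch register.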
template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
const ZRegister& zt,
const Tg& pg,
const SVEMemOperand& addr,
Tf fn,
int imm_bits,
int shift_amount,
SVEOffsetModifier supported_modifier,
int vl_divisor_log2) {
VIXL_ASSERT(allow_macro_instructions_);
int imm_divisor = 1 << shift_amount;
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusImmediate() &&
IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
((addr.GetImmediateOffset() % imm_divisor) == 0) &&
(addr.GetOffsetModifier() == supported_modifier))) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
return;
}
if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
(vl_divisor_log2 == -1)) {
// We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
// dependent.
VIXL_UNIMPLEMENTED();
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
const ZRegister& zt,
const Tg& pg,
const SVEMemOperand& addr,
Tf fn) {
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
(addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
addr.IsMulVl())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
return;
}
if (addr.IsVectorPlusImmediate()) {
uint64_t offset = addr.GetImmediateOffset();
if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
IsUint5(offset >> msize_in_bytes_log2)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
}
if (addr.IsScalarPlusVector()) {
VIXL_ASSERT(addr.IsScatterGather());
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
UseScratchRegisterScope temps(this);
if (addr.IsScatterGather()) {
// In scatter-gather modes, zt and zn/zm have the same lane size. However,
// for 32-bit accesses, the result of each lane's address calculation still
// requires 64 bits; we can't naively use `Adr` for the address calculation
// because it would truncate each address to 32 bits.
if (addr.IsVectorPlusImmediate()) {
// Synthesise the immediate in an X register, then use a
// scalar-plus-vector access with the original vector.
Register scratch = temps.AcquireX();
Mov(scratch, addr.GetImmediateOffset());
SingleEmissionCheckScope guard(this);
SVEOffsetModifier om =
zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
(this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
return;
}
VIXL_UNIMPLEMENTED();
} else {
Register scratch = temps.AcquireX();
// TODO: If we have an immediate offset that is a multiple of
// msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
// save an instruction.
int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
}
template <typename Tf>
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr,
Tf fn) {
if (addr.IsScatterGather()) {
// Scatter-gather first-fault loads share encodings with normal loads.
SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
return;
}
// Contiguous first-faulting loads have no scalar-plus-immediate form at all,
// so we don't do immediate synthesis.
// We cannot currently distinguish "[x0]" from "[x0, #0]", and this
// is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
VIXL_UNIMPLEMENTED();
}
void MacroAssembler::Ld1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1b));
}
void MacroAssembler::Ld1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1h));
}
void MacroAssembler::Ld1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kWRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1w));
}
void MacroAssembler::Ld1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1d));
}
void MacroAssembler::Ld1sb(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sb));
}
void MacroAssembler::Ld1sh(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sh));
}
void MacroAssembler::Ld1sw(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sw));
}
void MacroAssembler::St1b(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1b));
}
void MacroAssembler::St1h(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1h));
}
void MacroAssembler::St1w(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1w));
}
void MacroAssembler::St1d(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1d));
}
void MacroAssembler::Ldff1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1b));
}
void MacroAssembler::Ldff1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1h));
}
void MacroAssembler::Ldff1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1w));
}
void MacroAssembler::Ldff1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1d));
}
void MacroAssembler::Ldff1sb(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
}
void MacroAssembler::Ldff1sh(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
}
void MacroAssembler::Ldff1sw(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
}
#define VIXL_SVE_LD1R_LIST(V) \
V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
#define VIXL_DEFINE_MASM_FUNC(SZ, SH) \
void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \
const PRegisterZ& pg, \
const SVEMemOperand& addr) { \
VIXL_ASSERT(allow_macro_instructions_); \
SVELoadStoreNTBroadcastQOHelper(zt, \
pg, \
addr, \
&MacroAssembler::ld1r##SZ, \
4, \
SH, \
NO_SVE_OFFSET_MODIFIER, \
-1); \
}
VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
#undef VIXL_SVE_LD1R_LIST
void MacroAssembler::Ldnt1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1b(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1b,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1d(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1d,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1h(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1h,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1w(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1w,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1b(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1b(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1b,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1d(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1d(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1d,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1h(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1h(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1h,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1w(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1w(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1w,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
if (zd.Aliases(za)) {
// zda = zda + (zn . zm)
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, zn, zm, index);
} else if (zd.Aliases(zn) || zd.Aliases(zm)) {
// zdn = za + (zdn . zm[index])
// zdm = za + (zn . zdm[index])
// zdnm = za + (zdnm . zdnm[index])
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm, index);
}
Mov(zd, scratch);
} else {
// zd = za + (zn . zm)
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm, index);
}
}
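// Four-register destructive helpers: when zd aliases a multiplicand but not
// the accumulator, the result is built in a scratch register (movprfx'd from
// za) and then moved into zd; otherwise a movprfx from za into zd suffices.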
void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm
// zd = za . zn . zd
// zd = za . zd . zd
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm);
}
}
void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm
// zd = za . zn . zd
// zd = za . zd . zd
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, scratch, zn, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zd, zn, zm);
}
}
void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int imm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm[i]
// zd = za . zn . zd[i]
// zd = za . zd . zd[i]
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm, imm);
}
Mov(zd, scratch);
} else {
// zd = za . zn . zm[i]
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm, imm);
}
}
void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (zn.Aliases(zm)) {
// If zn == zm, the difference is zero.
if (!zd.Aliases(za)) {
Mov(zd, za);
}
} else if (zd.Aliases(za)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, zn, zm);
} else if (zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Mov(ztmp, zn);
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, ztmp, zm);
} else if (zd.Aliases(zm)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Mov(ztmp, zm);
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, ztmp);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm);
}
}
#define VIXL_SVE_4REG_LIST(V) \
V(Saba, saba, AbsoluteDifferenceAccumulate) \
V(Uaba, uaba, AbsoluteDifferenceAccumulate) \
V(Sabalb, sabalb, AbsoluteDifferenceAccumulate) \
V(Sabalt, sabalt, AbsoluteDifferenceAccumulate) \
V(Uabalb, uabalb, AbsoluteDifferenceAccumulate) \
V(Uabalt, uabalt, AbsoluteDifferenceAccumulate) \
V(Sdot, sdot, FourRegDestructiveHelper) \
V(Udot, udot, FourRegDestructiveHelper) \
V(Adclb, adclb, FourRegDestructiveHelper) \
V(Adclt, adclt, FourRegDestructiveHelper) \
V(Sbclb, sbclb, FourRegDestructiveHelper) \
V(Sbclt, sbclt, FourRegDestructiveHelper) \
V(Smlalb, smlalb, FourRegDestructiveHelper) \
V(Smlalt, smlalt, FourRegDestructiveHelper) \
V(Smlslb, smlslb, FourRegDestructiveHelper) \
V(Smlslt, smlslt, FourRegDestructiveHelper) \
V(Umlalb, umlalb, FourRegDestructiveHelper) \
V(Umlalt, umlalt, FourRegDestructiveHelper) \
V(Umlslb, umlslb, FourRegDestructiveHelper) \
V(Umlslt, umlslt, FourRegDestructiveHelper) \
V(Bcax, bcax, FourRegDestructiveHelper) \
V(Bsl, bsl, FourRegDestructiveHelper) \
V(Bsl1n, bsl1n, FourRegDestructiveHelper) \
V(Bsl2n, bsl2n, FourRegDestructiveHelper) \
V(Eor3, eor3, FourRegDestructiveHelper) \
V(Nbsl, nbsl, FourRegDestructiveHelper) \
V(Fmlalb, fmlalb, FourRegDestructiveHelper) \
V(Fmlalt, fmlalt, FourRegDestructiveHelper) \
V(Fmlslb, fmlslb, FourRegDestructiveHelper) \
V(Fmlslt, fmlslt, FourRegDestructiveHelper) \
V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper) \
V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper) \
V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper) \
V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper) \
V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper) \
V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper) \
V(Fmmla, fmmla, FourRegDestructiveHelper) \
V(Smmla, smmla, FourRegDestructiveHelper) \
V(Ummla, ummla, FourRegDestructiveHelper) \
V(Usmmla, usmmla, FourRegDestructiveHelper) \
V(Usdot, usdot, FourRegDestructiveHelper)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
HELPER(&Assembler::ASMFN, zd, za, zn, zm); \
}
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
#define VIXL_SVE_4REG_1IMM_LIST(V) \
V(Fmla, fmla, FourRegOneImmDestructiveHelper) \
V(Fmls, fmls, FourRegOneImmDestructiveHelper) \
V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper) \
V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper) \
V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper) \
V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper) \
V(Mla, mla, FourRegOneImmDestructiveHelper) \
V(Mls, mls, FourRegOneImmDestructiveHelper) \
V(Smlalb, smlalb, FourRegOneImmDestructiveHelper) \
V(Smlalt, smlalt, FourRegOneImmDestructiveHelper) \
V(Smlslb, smlslb, FourRegOneImmDestructiveHelper) \
V(Smlslt, smlslt, FourRegOneImmDestructiveHelper) \
V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
V(Umlalb, umlalb, FourRegOneImmDestructiveHelper) \
V(Umlalt, umlalt, FourRegOneImmDestructiveHelper) \
V(Umlslb, umlslb, FourRegOneImmDestructiveHelper) \
V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int imm) { \
VIXL_ASSERT(allow_macro_instructions_); \
HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm); \
}
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
void MacroAssembler::Sdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}
void MacroAssembler::Udot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}
void MacroAssembler::Sudot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}
void MacroAssembler::Usdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}
void MacroAssembler::Cdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index,
int rot) {
// This doesn't handle zm when it is out of the range that can be encoded in
// the instruction. The range depends on the element size: z0-z7 for B,
// z0-z15 for H.
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, ztmp, za);
cdot(ztmp, zn, zm, index, rot);
}
Mov(zd, ztmp);
} else {
MovprfxHelperScope guard(this, zd, za);
cdot(zd, zn, zm, index, rot);
}
}
void MacroAssembler::Cdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
VIXL_ASSERT(AreSameLaneSize(zn, zm));
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
Mov(ztmp, zd.Aliases(zn) ? zn : zm);
MovprfxHelperScope guard(this, zd, za);
cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
} else {
MovprfxHelperScope guard(this, zd, za);
cdot(zd, zn, zm, rot);
}
}
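// Predicated FP multiply-accumulate expansion: select the zda-destructive form
// (FMLA, FMLS, ...) or the zdn-destructive form (FMAD, FMSB, ...) according to
// which operand zd aliases. When zd aliases zm, the operand swap relies on the
// multiplication being commutative, so it is only applied under
// FastNaNPropagation.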
void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
SVEMulAddPredicatedZdaFn fn_zda,
SVEMulAddPredicatedZdnFn fn_zdn,
FPMacroNaNPropagationOption nan_option) {
ResolveFPNaNPropagationOption(&nan_option);
if (zd.Aliases(za)) {
// zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
SingleEmissionCheckScope guard(this);
(this->*fn_zda)(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
SingleEmissionCheckScope guard(this);
(this->*fn_zdn)(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
switch (nan_option) {
case FastNaNPropagation: {
// We treat multiplication as commutative in the fast mode, so we can
// swap zn and zm.
// zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
SingleEmissionCheckScope guard(this);
(this->*fn_zdn)(zd, pg, zn, za);
return;
}
case StrictNaNPropagation: {
UseScratchRegisterScope temps(this);
// Use a scratch register to keep the argument order exactly as
// specified.
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
{
MovprfxHelperScope guard(this, scratch, pg, za);
// scratch = (-)za + ((-)zn * zm)
(this->*fn_zda)(scratch, pg, zn, zm);
}
Mov(zd, scratch);
return;
}
case NoFPMacroNaNPropagationSelected:
VIXL_UNREACHABLE();
return;
}
} else {
// zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
MovprfxHelperScope guard(this, zd, pg, za);
(this->*fn_zda)(zd, pg, zn, zm);
}
}
void MacroAssembler::Fmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fmla,
&Assembler::fmad,
nan_option);
}
void MacroAssembler::Fmls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fmls,
&Assembler::fmsb,
nan_option);
}
void MacroAssembler::Fnmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fnmla,
&Assembler::fnmad,
nan_option);
}
void MacroAssembler::Fnmls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fnmls,
&Assembler::fnmsb,
nan_option);
}
void MacroAssembler::Ftmad(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int imm3) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
Mov(scratch, zm);
MovprfxHelperScope guard(this, zd, zn);
ftmad(zd, zd, scratch, imm3);
} else {
MovprfxHelperScope guard(this, zd, zn);
ftmad(zd, zd, zm, imm3);
}
}
void MacroAssembler::Fcadd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, pg, zn);
fcadd(scratch, pg, scratch, zm, rot);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, pg, zn);
fcadd(zd, pg, zd, zm, rot);
}
}
void MacroAssembler::Fcmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, ztmp, za);
fcmla(ztmp, pg, zn, zm, rot);
}
Mov(zd, pg, ztmp);
} else {
MovprfxHelperScope guard(this, zd, pg, za);
fcmla(zd, pg, zn, zm, rot);
}
}
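// SVE2 provides a constructive SPLICE form, used here when zn and zm are
// consecutive registers; otherwise the destructive form is used, via a scratch
// register when zd aliases zm.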
void MacroAssembler::Splice(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
SingleEmissionCheckScope guard(this);
splice(zd, pg, zn, zm);
} else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
splice(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
splice(zd, pg, zd, zm);
}
}
void MacroAssembler::Clasta(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
clasta(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
clasta(zd, pg, zd, zm);
}
}
void MacroAssembler::Clastb(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
clastb(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
clastb(zd, pg, zd, zm);
}
}
void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
VIXL_ASSERT(allow_macro_instructions_);
if (!zd.Aliases(za) && zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
Mov(ztmp, zn);
{
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, ztmp, shift);
}
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, shift);
}
}
void MacroAssembler::Srsra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}
void MacroAssembler::Ssra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}
void MacroAssembler::Ursra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}
void MacroAssembler::Usra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}
void MacroAssembler::ComplexAddition(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if (!zd.Aliases(zn) && zd.Aliases(zm)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
Mov(ztmp, zm);
{
MovprfxHelperScope guard(this, zd, zn);
(this->*fn)(zd, zd, ztmp, rot);
}
} else {
MovprfxHelperScope guard(this, zd, zn);
(this->*fn)(zd, zd, zm, rot);
}
}
void MacroAssembler::Cadd(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}
void MacroAssembler::Sqcadd(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}
} // namespace aarch64
} // namespace vixl