flycast/core/deps/vixl/aarch64/macro-assembler-sve-aarch64.cc

// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "macro-assembler-aarch64.h"
namespace vixl {
namespace aarch64 {
void MacroAssembler::AddSubHelper(AddSubHelperOption option,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(imm.FitsInLane(zd));
// Simple, encodable cases.
if (TrySingleAddSub(option, zd, zn, imm)) return;
VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
bool add_imm = (option == kAddImmediate);
// Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
// instruction. Also interpret the immediate as signed, so we can convert
// Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
if (signed_imm.IsNegative()) {
AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
IntegerOperand n_imm(signed_imm.GetMagnitude());
// IntegerOperand can represent -INT64_MIN, so this is always safe.
VIXL_ASSERT(n_imm.IsPositiveOrZero());
if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
}
// Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Dup(scratch, imm);
SingleEmissionCheckScope guard(this);
if (add_imm) {
add(zd, zn, scratch);
} else {
sub(zd, zn, scratch);
}
}
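// Try to encode "zd = zn +/- imm" as a single (optionally movprfx'd) ADD or SUB
// taking an unsigned 8-bit immediate, optionally shifted left by 8. Returns
// false if no such encoding exists, so the caller can fall back to a vector
// operand.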
bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(imm.FitsInLane(zd));
int imm8;
int shift = -1;
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
MovprfxHelperScope guard(this, zd, zn);
switch (option) {
case kAddImmediate:
add(zd, zd, imm8, shift);
return true;
case kSubImmediate:
sub(zd, zd, imm8, shift);
return true;
}
}
return false;
}
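// Shared expansion for wide-immediate forms (e.g. MUL, SMIN, UMAX): immediates
// that fit the instruction's signed or unsigned 8-bit field are encoded
// directly; anything else is broadcast into a scratch Z register and handled
// by the predicated register-form macro under an all-true governing predicate.
// For example, Mul(z0.VnS(), z1.VnS(), 1000) should take the fallback path,
// since 1000 does not fit in a signed 8-bit immediate.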
void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
SVEArithPredicatedFn reg_macro,
const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm,
bool is_signed) {
if (is_signed) {
// E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
if (imm.IsInt8()) {
MovprfxHelperScope guard(this, zd, zn);
(this->*imm_fn)(zd, zd, imm.AsInt8());
return;
}
} else {
// E.g. UMIN_z_zi, UMAX_z_zi
if (imm.IsUint8()) {
MovprfxHelperScope guard(this, zd, zn);
(this->*imm_fn)(zd, zd, imm.AsUint8());
return;
}
}
UseScratchRegisterScope temps(this);
PRegister pg = temps.AcquireGoverningP();
Ptrue(pg.WithSameLaneSizeAs(zd));
// Try to re-use zd if we can, so we can avoid a movprfx.
ZRegister scratch =
zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
: zd;
Dup(scratch, imm);
// The vector-form macro for commutative operations will swap the arguments to
// avoid movprfx, if necessary.
(this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}
void MacroAssembler::Mul(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
IntArithImmFn imm_fn = &Assembler::mul;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Smin(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInSignedLane(zd));
IntArithImmFn imm_fn = &Assembler::smin;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Smax(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInSignedLane(zd));
IntArithImmFn imm_fn = &Assembler::smax;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}
void MacroAssembler::Umax(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
IntArithImmFn imm_fn = &Assembler::umax;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}
void MacroAssembler::Umin(const ZRegister& zd,
const ZRegister& zn,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
IntArithImmFn imm_fn = &Assembler::umin;
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}
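// Compute xd = xn + (PL * multiplier), where PL is the predicate length in
// bytes (VL / 8). Encodable multipliers use `addpl` directly (materialising a
// zero first if xn is xzr), multiples of 8 are delegated to `Addvl`, and the
// general case derives PL from `rdvl` with an arithmetic shift right by 3.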
void MacroAssembler::Addpl(const Register& xd,
const Register& xn,
int64_t multiplier) {
VIXL_ASSERT(allow_macro_instructions_);
// This macro relies on `Rdvl` to handle some out-of-range cases. Check that
// `VL * multiplier` cannot overflow, for any possible value of VL.
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
if (xd.IsZero()) return;
if (xn.IsZero() && xd.IsSP()) {
// TODO: This operation doesn't make much sense, but we could support it
// with a scratch register if necessary.
VIXL_UNIMPLEMENTED();
}
// Handling xzr requires an extra move, so defer it until later, when we may be
// able to use `rdvl` (via `Addvl`) instead.
if (IsInt6(multiplier) && !xn.IsZero()) {
SingleEmissionCheckScope guard(this);
addpl(xd, xn, static_cast<int>(multiplier));
return;
}
// If `multiplier` is a multiple of 8, we can use `Addvl` instead.
if ((multiplier % kZRegBitsPerPRegBit) == 0) {
Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
return;
}
if (IsInt6(multiplier)) {
VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`.
// There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
// materialise a zero.
MacroEmissionCheckScope guard(this);
movz(xd, 0);
addpl(xd, xd, static_cast<int>(multiplier));
return;
}
// TODO: Some probable cases result in rather long sequences. For example,
// `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
// outside the encodable range. We should look for ways to cover such cases
// without drastically increasing the complexity of this logic.
// For other cases, calculate xn + (PL * multiplier) using discrete
// instructions. This requires two scratch registers in the general case, so
// try to re-use the destination as a scratch register.
UseScratchRegisterScope temps(this);
temps.Include(xd);
temps.Exclude(xn);
Register scratch = temps.AcquireX();
// There is no `rdpl`, so we have to calculate PL from VL. We can't scale the
// multiplier because (we already know) it isn't a multiple of 8.
Rdvl(scratch, multiplier);
MacroEmissionCheckScope guard(this);
if (xn.IsZero()) {
asr(xd, scratch, kZRegBitsPerPRegBitLog2);
} else if (xd.IsSP() || xn.IsSP()) {
// TODO: MacroAssembler::Add should be able to handle this.
asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
add(xd, xn, scratch);
} else {
add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
}
}
void MacroAssembler::Addvl(const Register& xd,
const Register& xn,
int64_t multiplier) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(xd.IsX());
VIXL_ASSERT(xn.IsX());
// Check that `VL * multiplier` cannot overflow, for any possible value of VL.
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
if (xd.IsZero()) return;
if (xn.IsZero() && xd.IsSP()) {
// TODO: This operation doesn't make much sense, but we could support it
// with a scratch register if necessary. `rdvl` cannot write into `sp`.
VIXL_UNIMPLEMENTED();
}
if (IsInt6(multiplier)) {
SingleEmissionCheckScope guard(this);
if (xn.IsZero()) {
rdvl(xd, static_cast<int>(multiplier));
} else {
addvl(xd, xn, static_cast<int>(multiplier));
}
return;
}
// TODO: Some probable cases result in rather long sequences. For example,
// `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
// outside the encodable range. We should look for ways to cover such cases
// without drastically increasing the complexity of this logic.
// For other cases, calculate xn + (VL * multiplier) using discrete
// instructions. This requires two scratch registers in the general case, so
// we try to re-use the destination as a scratch register.
UseScratchRegisterScope temps(this);
temps.Include(xd);
temps.Exclude(xn);
Register a = temps.AcquireX();
Mov(a, multiplier);
MacroEmissionCheckScope guard(this);
Register b = temps.AcquireX();
rdvl(b, 1);
if (xn.IsZero()) {
mul(xd, a, b);
} else if (xd.IsSP() || xn.IsSP()) {
mul(a, a, b);
add(xd, xn, a);
} else {
madd(xd, a, b, xn);
}
}
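// Materialise the effective address of a non-scatter-gather SVEMemOperand into
// xd. For "#imm, MUL VL" offsets, `vl_divisor_log2` is log2 of (VL divided by
// the in-memory access size), which lets the offset be lowered to an
// `Addpl`/`Addvl` sequence.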
void MacroAssembler::CalculateSVEAddress(const Register& xd,
const SVEMemOperand& addr,
int vl_divisor_log2) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(!addr.IsScatterGather());
VIXL_ASSERT(xd.IsX());
// The lower bound is where a whole Z register is accessed.
VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
// The upper bound is for P register accesses, and for instructions like
// "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
SVEOffsetModifier mod = addr.GetOffsetModifier();
Register base = addr.GetScalarBase();
if (addr.IsEquivalentToScalar()) {
// For example:
// [x0]
// [x0, #0]
// [x0, xzr, LSL 2]
Mov(xd, base);
} else if (addr.IsScalarPlusImmediate()) {
// For example:
// [x0, #42]
// [x0, #42, MUL VL]
int64_t offset = addr.GetImmediateOffset();
VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar.
if (addr.IsMulVl()) {
int vl_divisor = 1 << vl_divisor_log2;
// For all possible values of vl_divisor, we can simply use `Addpl`. This
// will select `addvl` if necessary.
VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
} else {
// IsScalarPlusImmediate() ensures that no other modifiers can occur.
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
Add(xd, base, offset);
}
} else if (addr.IsScalarPlusScalar()) {
// For example:
// [x0, x1]
// [x0, x1, LSL #4]
Register offset = addr.GetScalarOffset();
VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar.
if (mod == SVE_LSL) {
Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
} else {
// IsScalarPlusScalar() ensures that no other modifiers can occur.
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
Add(xd, base, offset);
}
} else {
// All other forms are scatter-gather addresses, which cannot be evaluated
// into an X register.
VIXL_UNREACHABLE();
}
}
void MacroAssembler::Cpy(const ZRegister& zd,
const PRegister& pg,
IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zd));
int imm8;
int shift;
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
SingleEmissionCheckScope guard(this);
cpy(zd, pg, imm8, shift);
return;
}
// The fallbacks rely on `cpy` variants that only support merging predication.
// If zeroing predication was requested, zero the destination first.
if (pg.IsZeroing()) {
SingleEmissionCheckScope guard(this);
dup(zd, 0);
}
PRegisterM pg_m = pg.Merging();
// Try to encode the immediate using fcpy.
VIXL_ASSERT(imm.FitsInLane(zd));
if (zd.GetLaneSizeInBits() >= kHRegSize) {
double fp_imm = 0.0;
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
fp_imm =
FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
break;
case kSRegSize:
fp_imm = RawbitsToFloat(imm.AsUint32());
break;
case kDRegSize:
fp_imm = RawbitsToDouble(imm.AsUint64());
break;
default:
VIXL_UNREACHABLE();
break;
}
// IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
// we can use IsImmFP64 for all lane sizes.
if (IsImmFP64(fp_imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg_m, fp_imm);
return;
}
}
// Fall back to using a scratch register.
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zd);
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
cpy(zd, pg_m, scratch);
}
// TODO: We implement Fcpy (amongst other things) for all FP types because it
// allows us to preserve user-specified NaNs. We should come up with some
// FPImmediate type to abstract this, and avoid all the duplication below (and
// elsewhere).
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
double imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP64(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
float imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP32(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
void MacroAssembler::Fcpy(const ZRegister& zd,
const PRegisterM& pg,
Float16 imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pg.IsMerging());
if (IsImmFP16(imm)) {
SingleEmissionCheckScope guard(this);
fcpy(zd, pg, imm);
return;
}
// As a fall-back, cast the immediate to the required lane size, and try to
// encode the bit pattern using `Cpy`.
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}
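// Broadcast an integer immediate to all lanes of zd. This prefers the shifted
// 8-bit immediate form of `dup`, then a `dupm` bitmask immediate, and finally
// falls back to materialising the value in a general-purpose scratch register.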
void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zd));
unsigned lane_size = zd.GetLaneSizeInBits();
int imm8;
int shift;
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
SingleEmissionCheckScope guard(this);
dup(zd, imm8, shift);
} else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
SingleEmissionCheckScope guard(this);
dupm(zd, imm.AsUintN(lane_size));
} else {
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zd);
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
dup(zd, scratch);
}
}
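// Shared expansion for predicated, destructive operations that have a reversed
// form (e.g. SUB/SUBR, FDIV/FDIVR): operate in place when zd aliases zn, switch
// to the reversed encoding when zd aliases zm, and otherwise movprfx zn into
// zd. For example, Fsub(z0.VnD(), pg, z1.VnD(), z0.VnD()) should lower to a
// single "fsubr z0.d, pg/m, z0.d, z1.d".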
void MacroAssembler::NoncommutativeArithmeticHelper(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
SVEArithPredicatedFn fn,
SVEArithPredicatedFn rev_fn) {
if (zd.Aliases(zn)) {
// E.g. zd = zd / zm
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zn, zm);
} else if (zd.Aliases(zm)) {
// E.g. zd = zn / zd
SingleEmissionCheckScope guard(this);
(this->*rev_fn)(zd, pg, zm, zn);
} else {
// E.g. zd = zn / zm
MovprfxHelperScope guard(this, zd, pg, zn);
(this->*fn)(zd, pg, zd, zm);
}
}
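// Shared expansion for predicated FP operations that are commutative (FADD,
// FMUL, FMAX, ...): when zd aliases zm, the operands can simply be swapped
// under FastNaNPropagation, while StrictNaNPropagation preserves the written
// operand order by building the result in a scratch register first.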
void MacroAssembler::FPCommutativeArithmeticHelper(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
SVEArithPredicatedFn fn,
FPMacroNaNPropagationOption nan_option) {
ResolveFPNaNPropagationOption(&nan_option);
if (zd.Aliases(zn)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zd, zm);
} else if (zd.Aliases(zm)) {
switch (nan_option) {
case FastNaNPropagation: {
// Swap the arguments.
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, pg, zd, zn);
return;
}
case StrictNaNPropagation: {
UseScratchRegisterScope temps(this);
// Use a scratch register to keep the argument order exactly as
// specified.
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
{
MovprfxHelperScope guard(this, scratch, pg, zn);
(this->*fn)(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
return;
}
case NoFPMacroNaNPropagationSelected:
VIXL_UNREACHABLE();
return;
}
} else {
MovprfxHelperScope guard(this, zd, pg, zn);
(this->*fn)(zd, pg, zd, zm);
}
}
// Instructions of the form "inst zda, zn, zm, #num", which are non-commutative
// and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
V(Cmla, cmla) \
V(Sqrdcmlah, sqrdcmlah)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int imm) { \
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
UseScratchRegisterScope temps(this); \
VIXL_ASSERT(AreSameLaneSize(zn, zm)); \
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \
Mov(ztmp, zd.Aliases(zn) ? zn : zm); \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, \
(zd.Aliases(zn) ? ztmp : zn), \
(zd.Aliases(zm) ? ztmp : zm), \
imm); \
} else { \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, zn, zm, imm); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, zn, zm, #num, #num", which are
// non-commutative and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
V(Cmla, cmla) \
V(Sqrdcmlah, sqrdcmlah)
// This doesn't handle zm when it is out of the range that can be encoded in
// the instruction. The range depends on the element size: z0-z7 for H,
// z0-z15 for S.
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int index, \
int rot) { \
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
UseScratchRegisterScope temps(this); \
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \
{ \
MovprfxHelperScope guard(this, ztmp, za); \
ASMFN(ztmp, zn, zm, index, rot); \
} \
Mov(zd, ztmp); \
} else { \
MovprfxHelperScope guard(this, zd, za); \
ASMFN(zd, zn, zm, index, rot); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which no reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
V(Addp, addp) \
V(Bic, bic) \
V(Faddp, faddp) \
V(Fmaxnmp, fmaxnmp) \
V(Fminnmp, fminnmp) \
V(Fmaxp, fmaxp) \
V(Fminp, fminp) \
V(Fscale, fscale) \
V(Smaxp, smaxp) \
V(Sminp, sminp) \
V(Suqadd, suqadd) \
V(Umaxp, umaxp) \
V(Uminp, uminp) \
V(Usqadd, usqadd)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const PRegisterM& pg, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
if (zd.Aliases(zm) && !zd.Aliases(zn)) { \
UseScratchRegisterScope temps(this); \
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
Mov(scratch, zm); \
MovprfxHelperScope guard(this, zd, pg, zn); \
ASMFN(zd, pg, zd, scratch); \
} else { \
MovprfxHelperScope guard(this, zd, pg, zn); \
ASMFN(zd, pg, zd, zm); \
} \
}
VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
// Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
// and for which a reversed form is provided.
#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
V(Asr, asr) \
V(Fdiv, fdiv) \
V(Fsub, fsub) \
V(Lsl, lsl) \
V(Lsr, lsr) \
V(Sdiv, sdiv) \
V(Shsub, shsub) \
V(Sqrshl, sqrshl) \
V(Sqshl, sqshl) \
V(Sqsub, sqsub) \
V(Srshl, srshl) \
V(Sub, sub) \
V(Udiv, udiv) \
V(Uhsub, uhsub) \
V(Uqrshl, uqrshl) \
V(Uqshl, uqshl) \
V(Uqsub, uqsub) \
V(Urshl, urshl)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const PRegisterM& pg, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
NoncommutativeArithmeticHelper(zd, \
pg, \
zn, \
zm, \
static_cast<SVEArithPredicatedFn>( \
&Assembler::ASMFN), \
static_cast<SVEArithPredicatedFn>( \
&Assembler::ASMFN##r)); \
}
VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
void MacroAssembler::Fadd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fadd),
nan_option);
}
void MacroAssembler::Fabd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fabd),
nan_option);
}
void MacroAssembler::Fmul(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmul),
nan_option);
}
void MacroAssembler::Fmulx(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmulx),
nan_option);
}
void MacroAssembler::Fmax(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmax),
nan_option);
}
void MacroAssembler::Fmin(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmin),
nan_option);
}
void MacroAssembler::Fmaxnm(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fmaxnm),
nan_option);
}
void MacroAssembler::Fminnm(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPCommutativeArithmeticHelper(zd,
pg,
zn,
zm,
static_cast<SVEArithPredicatedFn>(
&Assembler::fminnm),
nan_option);
}
void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
Fdup(zd, Float16(imm));
break;
case kSRegSize:
Fdup(zd, static_cast<float>(imm));
break;
case kDRegSize:
uint64_t bits = DoubleToRawbits(imm);
if (IsImmFP64(bits)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, bits);
}
break;
}
}
void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
Fdup(zd, Float16(imm));
break;
case kSRegSize:
if (IsImmFP32(imm)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, FloatToRawbits(imm));
}
break;
case kDRegSize:
Fdup(zd, static_cast<double>(imm));
break;
}
}
void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
VIXL_ASSERT(allow_macro_instructions_);
switch (zd.GetLaneSizeInBits()) {
case kHRegSize:
if (IsImmFP16(imm)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
Dup(zd, Float16ToRawbits(imm));
}
break;
case kSRegSize:
Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
break;
case kDRegSize:
Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
break;
}
}
void MacroAssembler::Index(const ZRegister& zd,
const Operand& start,
const Operand& step) {
class IndexOperand : public Operand {
public:
static IndexOperand Prepare(MacroAssembler* masm,
UseScratchRegisterScope* temps,
const Operand& op,
const ZRegister& zd_inner) {
// Look for encodable immediates.
int imm;
if (op.IsImmediate()) {
if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
return IndexOperand(imm);
}
Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
masm->Mov(scratch, op);
return IndexOperand(scratch);
} else {
// Plain registers can be encoded directly.
VIXL_ASSERT(op.IsPlainRegister());
return IndexOperand(op.GetRegister());
}
}
int GetImm5() const {
int64_t imm = GetImmediate();
VIXL_ASSERT(IsInt5(imm));
return static_cast<int>(imm);
}
private:
explicit IndexOperand(const Register& reg) : Operand(reg) {}
explicit IndexOperand(int64_t imm) : Operand(imm) {}
};
UseScratchRegisterScope temps(this);
IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
SingleEmissionCheckScope guard(this);
if (start_enc.IsImmediate()) {
if (step_enc.IsImmediate()) {
index(zd, start_enc.GetImm5(), step_enc.GetImm5());
} else {
index(zd, start_enc.GetImm5(), step_enc.GetRegister());
}
} else {
if (step_enc.IsImmediate()) {
index(zd, start_enc.GetRegister(), step_enc.GetImm5());
} else {
index(zd, start_enc.GetRegister(), step_enc.GetRegister());
}
}
}
void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(imm.FitsInLane(zdn));
if (imm.IsZero()) {
SingleEmissionCheckScope guard(this);
insr(zdn, xzr);
return;
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireRegisterToHoldLane(zdn);
// TODO: There are many cases where we could optimise immediates, such as by
// detecting repeating patterns or FP immediates. We should optimise and
// abstract this for use in other SVE mov-immediate-like macros.
Mov(scratch, imm);
SingleEmissionCheckScope guard(this);
insr(zdn, scratch);
}
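// Mla and Mls pick between the accumulator-destructive (MLA/MLS) and
// multiplicand-destructive (MAD/MSB) encodings according to which input zd
// aliases, relying on the commutativity of the multiplication; only the case
// where zd is distinct from all inputs needs a movprfx.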
void MacroAssembler::Mla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(za)) {
// zda = zda + (zn * zm)
SingleEmissionCheckScope guard(this);
mla(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = za + (zdn * zm)
SingleEmissionCheckScope guard(this);
mad(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
// Multiplication is commutative, so we can swap zn and zm.
// zdm = za + (zdm * zn)
SingleEmissionCheckScope guard(this);
mad(zd, pg, zn, za);
} else {
// zd = za + (zn * zm)
ExactAssemblyScope guard(this, 2 * kInstructionSize);
movprfx(zd, pg, za);
mla(zd, pg, zn, zm);
}
}
void MacroAssembler::Mls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(za)) {
// zda = zda - (zn * zm)
SingleEmissionCheckScope guard(this);
mls(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = za - (zdn * zm)
SingleEmissionCheckScope guard(this);
msb(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
// Multiplication is commutative, so we can swap zn and zm.
// zdm = za - (zdm * zn)
SingleEmissionCheckScope guard(this);
msb(zd, pg, zn, za);
} else {
// zd = za - (zn * zm)
ExactAssemblyScope guard(this, 2 * kInstructionSize);
movprfx(zd, pg, za);
mls(zd, pg, zn, zm);
}
}
void MacroAssembler::CompareHelper(Condition cond,
const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
IntegerOperand imm) {
UseScratchRegisterScope temps(this);
ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Dup(zm, imm);
SingleEmissionCheckScope guard(this);
cmp(cond, pd, pg, zn, zm);
}
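// `pfirst` and `pnext` are destructive (they require pd == pn), so when pd
// differs from pn the macro copies pn into pd first, preserving pg in a
// scratch predicate if pd aliases it.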
void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(pd.IsLaneSizeB());
VIXL_ASSERT(pn.IsLaneSizeB());
if (pd.Is(pn)) {
SingleEmissionCheckScope guard(this);
pfirst(pd, pg, pn);
} else {
UseScratchRegisterScope temps(this);
PRegister temp_pg = pg;
if (pd.Aliases(pg)) {
temp_pg = temps.AcquireP();
Mov(temp_pg.VnB(), pg.VnB());
}
Mov(pd, pn);
SingleEmissionCheckScope guard(this);
pfirst(pd, temp_pg, pd);
}
}
void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(AreSameFormat(pd, pn));
if (pd.Is(pn)) {
SingleEmissionCheckScope guard(this);
pnext(pd, pg, pn);
} else {
UseScratchRegisterScope temps(this);
PRegister temp_pg = pg;
if (pd.Aliases(pg)) {
temp_pg = temps.AcquireP();
Mov(temp_pg.VnB(), pg.VnB());
}
Mov(pd.VnB(), pn.VnB());
SingleEmissionCheckScope guard(this);
pnext(pd, temp_pg, pd);
}
}
void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
SVEPredicateConstraint pattern,
FlagsUpdate s) {
VIXL_ASSERT(allow_macro_instructions_);
switch (s) {
case LeaveFlags:
Ptrue(pd, pattern);
return;
case SetFlags:
Ptrues(pd, pattern);
return;
}
VIXL_UNREACHABLE();
}
void MacroAssembler::Sub(const ZRegister& zd,
IntegerOperand imm,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
int imm8;
int shift = -1;
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
MovprfxHelperScope guard(this, zd, zm);
subr(zd, zd, imm8, shift);
} else {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
Dup(scratch, imm);
SingleEmissionCheckScope guard(this);
sub(zd, scratch, zm);
}
}
void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr,
SVELoadBroadcastFn fn,
int divisor) {
VIXL_ASSERT(addr.IsScalarPlusImmediate());
int64_t imm = addr.GetImmediateOffset();
if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
} else {
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, zt);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
}
void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
const SVEMemOperand& addr,
SVELoadStoreFn fn) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
addr.IsMulVl())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
return;
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, rt);
SingleEmissionCheckScope guard(this);
(this->*fn)(rt, SVEMemOperand(scratch));
}
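// Shared expansion for non-temporal and quadword/octoword broadcast forms: the
// scalar-plus-immediate encoding is used when the offset is a multiple of
// (1 << shift_amount), fits in `imm_bits` and uses `supported_modifier`;
// otherwise the address is computed into a scratch register.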
template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
const ZRegister& zt,
const Tg& pg,
const SVEMemOperand& addr,
Tf fn,
int imm_bits,
int shift_amount,
SVEOffsetModifier supported_modifier,
int vl_divisor_log2) {
VIXL_ASSERT(allow_macro_instructions_);
int imm_divisor = 1 << shift_amount;
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusImmediate() &&
IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
((addr.GetImmediateOffset() % imm_divisor) == 0) &&
(addr.GetOffsetModifier() == supported_modifier))) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
return;
}
if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
(vl_divisor_log2 == -1)) {
// We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
// dependent.
VIXL_UNIMPLEMENTED();
}
UseScratchRegisterScope temps(this);
Register scratch = temps.AcquireX();
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
const ZRegister& zt,
const Tg& pg,
const SVEMemOperand& addr,
Tf fn) {
if (addr.IsPlainScalar() ||
(addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
(addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
addr.IsMulVl())) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
if (addr.IsEquivalentToScalar()) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
return;
}
if (addr.IsVectorPlusImmediate()) {
uint64_t offset = addr.GetImmediateOffset();
if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
IsUint5(offset >> msize_in_bytes_log2)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
}
if (addr.IsScalarPlusVector()) {
VIXL_ASSERT(addr.IsScatterGather());
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
UseScratchRegisterScope temps(this);
if (addr.IsScatterGather()) {
// In scatter-gather modes, zt and zn/zm have the same lane size. However,
// for 32-bit accesses, the result of each lane's address calculation still
// requires 64 bits; we can't naively use `Adr` for the address calculation
// because it would truncate each address to 32 bits.
if (addr.IsVectorPlusImmediate()) {
// Synthesise the immediate in an X register, then use a
// scalar-plus-vector access with the original vector.
Register scratch = temps.AcquireX();
Mov(scratch, addr.GetImmediateOffset());
SingleEmissionCheckScope guard(this);
SVEOffsetModifier om =
zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
(this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
return;
}
VIXL_UNIMPLEMENTED();
} else {
Register scratch = temps.AcquireX();
// TODO: If we have an immediate offset that is a multiple of
// msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
// save an instruction.
int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, SVEMemOperand(scratch));
}
}
template <typename Tf>
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr,
Tf fn) {
if (addr.IsScatterGather()) {
// Scatter-gather first-fault loads share encodings with normal loads.
SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
return;
}
// Contiguous first-faulting loads have no scalar-plus-immediate form at all,
// so we don't do immediate synthesis.
// We cannot currently distinguish "[x0]" from "[x0, #0]", and this
// is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zt, pg, addr);
return;
}
VIXL_UNIMPLEMENTED();
}
void MacroAssembler::Ld1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1b));
}
void MacroAssembler::Ld1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1h));
}
void MacroAssembler::Ld1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kWRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1w));
}
void MacroAssembler::Ld1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1d));
}
void MacroAssembler::Ld1sb(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sb));
}
void MacroAssembler::Ld1sh(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sh));
}
void MacroAssembler::Ld1sw(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ld1sw));
}
void MacroAssembler::St1b(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1b));
}
void MacroAssembler::St1h(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1h));
}
void MacroAssembler::St1w(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1w));
}
void MacroAssembler::St1d(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadStore1Helper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVEStore1Fn>(&Assembler::st1d));
}
void MacroAssembler::Ldff1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1b));
}
void MacroAssembler::Ldff1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1h));
}
void MacroAssembler::Ldff1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1w));
}
void MacroAssembler::Ldff1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kDRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1d));
}
void MacroAssembler::Ldff1sb(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kBRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
}
void MacroAssembler::Ldff1sh(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kHRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
}
void MacroAssembler::Ldff1sw(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
SVELoadFFHelper(kSRegSizeInBytesLog2,
zt,
pg,
addr,
static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
}
#define VIXL_SVE_LD1R_LIST(V) \
V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
#define VIXL_DEFINE_MASM_FUNC(SZ, SH) \
void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \
const PRegisterZ& pg, \
const SVEMemOperand& addr) { \
VIXL_ASSERT(allow_macro_instructions_); \
SVELoadStoreNTBroadcastQOHelper(zt, \
pg, \
addr, \
&MacroAssembler::ld1r##SZ, \
4, \
SH, \
NO_SVE_OFFSET_MODIFIER, \
-1); \
}
VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
#undef VIXL_SVE_LD1R_LIST
void MacroAssembler::Ldnt1b(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1b(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1b,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1d(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1d(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1d,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1h(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1h(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1h,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Ldnt1w(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
ldnt1w(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::ldnt1w,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1b(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1b(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1b,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1d(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1d(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1d,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1h(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1h(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1h,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::Stnt1w(const ZRegister& zt,
const PRegister& pg,
const SVEMemOperand& addr) {
VIXL_ASSERT(allow_macro_instructions_);
if (addr.IsVectorPlusScalar()) {
SingleEmissionCheckScope guard(this);
stnt1w(zt, pg, addr);
} else {
SVELoadStoreNTBroadcastQOHelper(zt,
pg,
addr,
&MacroAssembler::stnt1w,
4,
0,
SVE_MUL_VL);
}
}
void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
if (zd.Aliases(za)) {
// zda = zda + (zn . zm)
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, zn, zm, index);
} else if (zd.Aliases(zn) || zd.Aliases(zm)) {
// zdn = za + (zdn . zm[index])
// zdm = za + (zn . zdm[index])
// zdnm = za + (zdnm . zdnm[index])
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm, index);
}
Mov(zd, scratch);
} else {
// zd = za + (zn . zm)
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm, index);
}
}
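// Four-register destructive helpers: when zd aliases a multiplicand but not
// the accumulator, the result is built in a scratch register (movprfx'd from
// za) and then moved into zd; otherwise a movprfx from za into zd suffices.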
void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm
// zd = za . zn . zd
// zd = za . zd . zd
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm);
}
}
void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm
// zd = za . zn . zd
// zd = za . zd . zd
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, scratch, zn, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zd, zn, zm);
}
}
void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int imm) {
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
// zd = za . zd . zm[i]
// zd = za . zn . zd[i]
// zd = za . zd . zd[i]
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, za);
(this->*fn)(scratch, zn, zm, imm);
}
Mov(zd, scratch);
} else {
// zd = za . zn . zm[i]
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm, imm);
}
}
void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm) {
if (zn.Aliases(zm)) {
// If zn == zm, the difference is zero.
if (!zd.Aliases(za)) {
Mov(zd, za);
}
} else if (zd.Aliases(za)) {
SingleEmissionCheckScope guard(this);
(this->*fn)(zd, zn, zm);
} else if (zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Mov(ztmp, zn);
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, ztmp, zm);
} else if (zd.Aliases(zm)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
Mov(ztmp, zm);
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, ztmp);
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, zm);
}
}
#define VIXL_SVE_4REG_LIST(V) \
V(Saba, saba, AbsoluteDifferenceAccumulate) \
V(Uaba, uaba, AbsoluteDifferenceAccumulate) \
V(Sabalb, sabalb, AbsoluteDifferenceAccumulate) \
V(Sabalt, sabalt, AbsoluteDifferenceAccumulate) \
V(Uabalb, uabalb, AbsoluteDifferenceAccumulate) \
V(Uabalt, uabalt, AbsoluteDifferenceAccumulate) \
V(Sdot, sdot, FourRegDestructiveHelper) \
V(Udot, udot, FourRegDestructiveHelper) \
V(Adclb, adclb, FourRegDestructiveHelper) \
V(Adclt, adclt, FourRegDestructiveHelper) \
V(Sbclb, sbclb, FourRegDestructiveHelper) \
V(Sbclt, sbclt, FourRegDestructiveHelper) \
V(Smlalb, smlalb, FourRegDestructiveHelper) \
V(Smlalt, smlalt, FourRegDestructiveHelper) \
V(Smlslb, smlslb, FourRegDestructiveHelper) \
V(Smlslt, smlslt, FourRegDestructiveHelper) \
V(Umlalb, umlalb, FourRegDestructiveHelper) \
V(Umlalt, umlalt, FourRegDestructiveHelper) \
V(Umlslb, umlslb, FourRegDestructiveHelper) \
V(Umlslt, umlslt, FourRegDestructiveHelper) \
V(Bcax, bcax, FourRegDestructiveHelper) \
V(Bsl, bsl, FourRegDestructiveHelper) \
V(Bsl1n, bsl1n, FourRegDestructiveHelper) \
V(Bsl2n, bsl2n, FourRegDestructiveHelper) \
V(Eor3, eor3, FourRegDestructiveHelper) \
V(Nbsl, nbsl, FourRegDestructiveHelper) \
V(Fmlalb, fmlalb, FourRegDestructiveHelper) \
V(Fmlalt, fmlalt, FourRegDestructiveHelper) \
V(Fmlslb, fmlslb, FourRegDestructiveHelper) \
V(Fmlslt, fmlslt, FourRegDestructiveHelper) \
V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper) \
V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper) \
V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper) \
V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper) \
V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper) \
V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper) \
V(Fmmla, fmmla, FourRegDestructiveHelper) \
V(Smmla, smmla, FourRegDestructiveHelper) \
V(Ummla, ummla, FourRegDestructiveHelper) \
V(Usmmla, usmmla, FourRegDestructiveHelper) \
V(Usdot, usdot, FourRegDestructiveHelper)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm) { \
VIXL_ASSERT(allow_macro_instructions_); \
HELPER(&Assembler::ASMFN, zd, za, zn, zm); \
}
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
#define VIXL_SVE_4REG_1IMM_LIST(V) \
V(Fmla, fmla, FourRegOneImmDestructiveHelper) \
V(Fmls, fmls, FourRegOneImmDestructiveHelper) \
V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper) \
V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper) \
V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper) \
V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper) \
V(Mla, mla, FourRegOneImmDestructiveHelper) \
V(Mls, mls, FourRegOneImmDestructiveHelper) \
V(Smlalb, smlalb, FourRegOneImmDestructiveHelper) \
V(Smlalt, smlalt, FourRegOneImmDestructiveHelper) \
V(Smlslb, smlslb, FourRegOneImmDestructiveHelper) \
V(Smlslt, smlslt, FourRegOneImmDestructiveHelper) \
V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
V(Umlalb, umlalb, FourRegOneImmDestructiveHelper) \
V(Umlalt, umlalt, FourRegOneImmDestructiveHelper) \
V(Umlslb, umlslb, FourRegOneImmDestructiveHelper) \
V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
void MacroAssembler::MASMFN(const ZRegister& zd, \
const ZRegister& za, \
const ZRegister& zn, \
const ZRegister& zm, \
int imm) { \
VIXL_ASSERT(allow_macro_instructions_); \
HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm); \
}
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC
void MacroAssembler::Sdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}
void MacroAssembler::Udot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}
void MacroAssembler::Sudot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}
void MacroAssembler::Usdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index) {
VIXL_ASSERT(allow_macro_instructions_);
SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}
void MacroAssembler::Cdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index,
int rot) {
// This doesn't handle zm when it is out of the range that can be encoded in
// the instruction. The range depends on the element size: z0-z7 for B,
// z0-z15 for H.
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, ztmp, za);
cdot(ztmp, zn, zm, index, rot);
}
Mov(zd, ztmp);
} else {
MovprfxHelperScope guard(this, zd, za);
cdot(zd, zn, zm, index, rot);
}
}
void MacroAssembler::Cdot(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
VIXL_ASSERT(AreSameLaneSize(zn, zm));
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
Mov(ztmp, zd.Aliases(zn) ? zn : zm);
MovprfxHelperScope guard(this, zd, za);
cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
} else {
MovprfxHelperScope guard(this, zd, za);
cdot(zd, zn, zm, rot);
}
}
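// Predicated FP multiply-accumulate expansion: select the zda-destructive form
// (FMLA, FMLS, ...) or the zdn-destructive form (FMAD, FMSB, ...) according to
// which operand zd aliases. When zd aliases zm, the operand swap relies on the
// multiplication being commutative, so it is only applied under
// FastNaNPropagation.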
void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
SVEMulAddPredicatedZdaFn fn_zda,
SVEMulAddPredicatedZdnFn fn_zdn,
FPMacroNaNPropagationOption nan_option) {
ResolveFPNaNPropagationOption(&nan_option);
if (zd.Aliases(za)) {
// zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
SingleEmissionCheckScope guard(this);
(this->*fn_zda)(zd, pg, zn, zm);
} else if (zd.Aliases(zn)) {
// zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
SingleEmissionCheckScope guard(this);
(this->*fn_zdn)(zd, pg, zm, za);
} else if (zd.Aliases(zm)) {
switch (nan_option) {
case FastNaNPropagation: {
// We treat multiplication as commutative in the fast mode, so we can
// swap zn and zm.
// zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
SingleEmissionCheckScope guard(this);
(this->*fn_zdn)(zd, pg, zn, za);
return;
}
case StrictNaNPropagation: {
UseScratchRegisterScope temps(this);
// Use a scratch register to keep the argument order exactly as
// specified.
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
{
MovprfxHelperScope guard(this, scratch, pg, za);
// scratch = (-)za + ((-)zn * zm)
(this->*fn_zda)(scratch, pg, zn, zm);
}
Mov(zd, scratch);
return;
}
case NoFPMacroNaNPropagationSelected:
VIXL_UNREACHABLE();
return;
}
} else {
// zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
MovprfxHelperScope guard(this, zd, pg, za);
(this->*fn_zda)(zd, pg, zn, zm);
}
}
void MacroAssembler::Fmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fmla,
&Assembler::fmad,
nan_option);
}
void MacroAssembler::Fmls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fmls,
&Assembler::fmsb,
nan_option);
}
void MacroAssembler::Fnmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fnmla,
&Assembler::fnmad,
nan_option);
}
void MacroAssembler::Fnmls(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option) {
VIXL_ASSERT(allow_macro_instructions_);
FPMulAddHelper(zd,
pg,
za,
zn,
zm,
&Assembler::fnmls,
&Assembler::fnmsb,
nan_option);
}
void MacroAssembler::Ftmad(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int imm3) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
Mov(scratch, zm);
MovprfxHelperScope guard(this, zd, zn);
ftmad(zd, zd, scratch, imm3);
} else {
MovprfxHelperScope guard(this, zd, zn);
ftmad(zd, zd, zm, imm3);
}
}
void MacroAssembler::Fcadd(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, pg, zn);
fcadd(scratch, pg, scratch, zm, rot);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, pg, zn);
fcadd(zd, pg, zd, zm, rot);
}
}
void MacroAssembler::Fcmla(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, ztmp, za);
fcmla(ztmp, pg, zn, zm, rot);
}
Mov(zd, pg, ztmp);
} else {
MovprfxHelperScope guard(this, zd, pg, za);
fcmla(zd, pg, zn, zm, rot);
}
}
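// SVE2 provides a constructive SPLICE form, used here when zn and zm are
// consecutive registers; otherwise the destructive form is used, via a scratch
// register when zd aliases zm.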
void MacroAssembler::Splice(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
SingleEmissionCheckScope guard(this);
splice(zd, pg, zn, zm);
} else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
splice(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
splice(zd, pg, zd, zm);
}
}
void MacroAssembler::Clasta(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
clasta(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
clasta(zd, pg, zd, zm);
}
}
void MacroAssembler::Clastb(const ZRegister& zd,
const PRegister& pg,
const ZRegister& zn,
const ZRegister& zm) {
VIXL_ASSERT(allow_macro_instructions_);
if (zd.Aliases(zm) && !zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
{
MovprfxHelperScope guard(this, scratch, zn);
clastb(scratch, pg, scratch, zm);
}
Mov(zd, scratch);
} else {
MovprfxHelperScope guard(this, zd, zn);
clastb(zd, pg, zd, zm);
}
}
void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
VIXL_ASSERT(allow_macro_instructions_);
if (!zd.Aliases(za) && zd.Aliases(zn)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
Mov(ztmp, zn);
{
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, ztmp, shift);
}
} else {
MovprfxHelperScope guard(this, zd, za);
(this->*fn)(zd, zn, shift);
}
}
void MacroAssembler::Srsra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}
void MacroAssembler::Ssra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}
void MacroAssembler::Ursra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}
void MacroAssembler::Usra(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
int shift) {
ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}
void MacroAssembler::ComplexAddition(ZZZImmFn fn,
const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
VIXL_ASSERT(allow_macro_instructions_);
if (!zd.Aliases(zn) && zd.Aliases(zm)) {
UseScratchRegisterScope temps(this);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
Mov(ztmp, zm);
{
MovprfxHelperScope guard(this, zd, zn);
(this->*fn)(zd, zd, ztmp, rot);
}
} else {
MovprfxHelperScope guard(this, zd, zn);
(this->*fn)(zd, zd, zm, rot);
}
}
void MacroAssembler::Cadd(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}
void MacroAssembler::Sqcadd(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm,
int rot) {
ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}
} // namespace aarch64
} // namespace vixl