From a4fbbd779a29b912299bc2830f0157513080ddb7 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 3 May 2021 16:47:37 -0700 Subject: [PATCH 01/15] tcg: Change parameters for tcg_target_const_match Change the return value to bool, because that's what is should have been from the start. Pass the ct mask instead of the whole TCGArgConstraint, as that's the only part that's relevant. Change the value argument to int64_t. We will need the extra width for 32-bit hosts wanting to match vector constants. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/aarch64/tcg-target.c.inc | 5 +---- tcg/arm/tcg-target.c.inc | 5 +---- tcg/i386/tcg-target.c.inc | 4 +--- tcg/mips/tcg-target.c.inc | 5 +---- tcg/ppc/tcg-target.c.inc | 4 +--- tcg/riscv/tcg-target.c.inc | 4 +--- tcg/s390/tcg-target.c.inc | 5 +---- tcg/sparc/tcg-target.c.inc | 5 +---- tcg/tcg.c | 5 ++--- tcg/tci/tcg-target.c.inc | 6 ++---- 10 files changed, 12 insertions(+), 36 deletions(-) diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index 5bd366f2d4..27cde314a9 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -277,11 +277,8 @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8) } } -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; - if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 8457108a87..eb4f42e53d 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -301,11 +301,8 @@ static inline int check_fit_imm(uint32_t imm) * mov operand2: values represented with x << (2 * y), x < 0x100 * add, sub, eor...: ditto */ -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct; - ct = arg_ct->ct; if (ct & TCG_CT_CONST) { return 1; } else if ((ct & TCG_CT_CONST_ARM) && check_fit_imm(val)) { diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index 415c5c0796..34113388ef 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -210,10 +210,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, } /* test if a constant matches the constraint */ -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc index 8b16726242..5944448b2a 100644 --- a/tcg/mips/tcg-target.c.inc +++ b/tcg/mips/tcg-target.c.inc @@ -193,11 +193,8 @@ static inline bool is_p2m1(tcg_target_long val) } /* test if a constant matches the constraint */ -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct; - ct = arg_ct->ct; if (ct & TCG_CT_CONST) { return 1; } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) { diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 838ccfa42d..795701442b 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -238,10 +238,8 @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target) } /* test if a constant matches the constraint */ -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index ef43147040..da7eecafc5 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -145,10 +145,8 @@ static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len) } /* test if a constant matches the constraint */ -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc index af8dfe81ac..5fe073f09a 100644 --- a/tcg/s390/tcg-target.c.inc +++ b/tcg/s390/tcg-target.c.inc @@ -417,11 +417,8 @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type, } /* Test if a constant matches the constraint. */ -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; - if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc index 3d50f985c6..ce39ac2d86 100644 --- a/tcg/sparc/tcg-target.c.inc +++ b/tcg/sparc/tcg-target.c.inc @@ -341,11 +341,8 @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type, } /* test if a constant matches the constraint */ -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - int ct = arg_ct->ct; - if (ct & TCG_CT_CONST) { return 1; } diff --git a/tcg/tcg.c b/tcg/tcg.c index db806a6658..0dc271aac9 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -148,8 +148,7 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, TCGReg base, intptr_t ofs); static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target); -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct); +static bool tcg_target_const_match(int64_t val, TCGType type, int ct); #ifdef TCG_TARGET_NEED_LDST_LABELS static int tcg_out_ldst_finalize(TCGContext *s); #endif @@ -4078,7 +4077,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) ts = arg_temp(arg); if (ts->val_type == TEMP_VAL_CONST - && tcg_target_const_match(ts->val, ts->type, arg_ct)) { + && tcg_target_const_match(ts->val, ts->type, arg_ct->ct)) { /* constant is OK for instruction */ const_args[i] = 1; new_args[i] = ts->val; diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc index ee6cdfec71..823ecd5d35 100644 --- a/tcg/tci/tcg-target.c.inc +++ b/tcg/tci/tcg-target.c.inc @@ -789,11 +789,9 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, } /* Test if a constant matches the constraint. */ -static int tcg_target_const_match(tcg_target_long val, TCGType type, - const TCGArgConstraint *arg_ct) +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) { - /* No need to return 0 or 1, 0 or != 0 is good enough. */ - return arg_ct->ct & TCG_CT_CONST; + return ct & TCG_CT_CONST; } static void tcg_target_init(TCGContext *s) From 000cf4777aadda69d14a6994ca0d195a36733cbd Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 3 May 2021 16:47:52 -0700 Subject: [PATCH 02/15] tcg/arm: Add host vector framework Add registers and function stubs. The functionality is disabled via use_neon_instructions defined to 0. We must still include results for the mandatory opcodes in tcg_target_op_def, as all opcodes are checked during tcg init. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target-con-set.h | 4 ++ tcg/arm/tcg-target-con-str.h | 1 + tcg/arm/tcg-target.c.inc | 117 +++++++++++++++++++++++++++++------ tcg/arm/tcg-target.h | 48 ++++++++++++-- tcg/arm/tcg-target.opc.h | 12 ++++ 5 files changed, 158 insertions(+), 24 deletions(-) create mode 100644 tcg/arm/tcg-target.opc.h diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index ab63e089c2..27aced5391 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -13,11 +13,14 @@ C_O0_I1(r) C_O0_I2(r, r) C_O0_I2(r, rIN) C_O0_I2(s, s) +C_O0_I2(w, r) C_O0_I3(s, s, s) C_O0_I4(r, r, rI, rI) C_O0_I4(s, s, s, s) C_O1_I1(r, l) C_O1_I1(r, r) +C_O1_I1(w, r) +C_O1_I1(w, wr) C_O1_I2(r, 0, rZ) C_O1_I2(r, l, l) C_O1_I2(r, r, r) @@ -26,6 +29,7 @@ C_O1_I2(r, r, rIK) C_O1_I2(r, r, rIN) C_O1_I2(r, r, ri) C_O1_I2(r, rZ, rZ) +C_O1_I2(w, w, w) C_O1_I4(r, r, r, rI, rI) C_O1_I4(r, r, rIN, rIK, 0) C_O2_I1(r, r, l) diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h index a0ab7747db..255a1ae0e2 100644 --- a/tcg/arm/tcg-target-con-str.h +++ b/tcg/arm/tcg-target-con-str.h @@ -11,6 +11,7 @@ REGS('r', ALL_GENERAL_REGS) REGS('l', ALL_QLOAD_REGS) REGS('s', ALL_QSTORE_REGS) +REGS('w', ALL_VECTOR_REGS) /* * Define constraint letters for constants: diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index eb4f42e53d..4770d0c537 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -40,22 +40,10 @@ bool use_idiv_instructions; #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { - "%r0", - "%r1", - "%r2", - "%r3", - "%r4", - "%r5", - "%r6", - "%r7", - "%r8", - "%r9", - "%r10", - "%r11", - "%r12", - "%r13", - "%r14", - "%pc", + "%r0", "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", + "%r8", "%r9", "%r10", "%r11", "%r12", "%sp", "%r14", "%pc", + "%q0", "%q1", "%q2", "%q3", "%q4", "%q5", "%q6", "%q7", + "%q8", "%q9", "%q10", "%q11", "%q12", "%q13", "%q14", "%q15", }; #endif @@ -75,6 +63,20 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_R3, TCG_REG_R12, TCG_REG_R14, + + TCG_REG_Q0, + TCG_REG_Q1, + TCG_REG_Q2, + TCG_REG_Q3, + /* Q4 - Q7 are call-saved, and skipped. */ + TCG_REG_Q8, + TCG_REG_Q9, + TCG_REG_Q10, + TCG_REG_Q11, + TCG_REG_Q12, + TCG_REG_Q13, + TCG_REG_Q14, + TCG_REG_Q15, }; static const int tcg_target_call_iarg_regs[4] = { @@ -85,6 +87,7 @@ static const int tcg_target_call_oarg_regs[2] = { }; #define TCG_REG_TMP TCG_REG_R12 +#define TCG_VEC_TMP TCG_REG_Q15 enum arm_cond_code_e { COND_EQ = 0x0, @@ -238,6 +241,7 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, #define TCG_CT_CONST_ZERO 0x800 #define ALL_GENERAL_REGS 0xffffu +#define ALL_VECTOR_REGS 0xffff0000u /* * r0-r2 will be overwritten when reading the tlb entry (softmmu only) @@ -2117,6 +2121,22 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_i64: return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s); + case INDEX_op_st_vec: + return C_O0_I2(w, r); + case INDEX_op_ld_vec: + case INDEX_op_dupm_vec: + return C_O1_I1(w, r); + case INDEX_op_dup_vec: + return C_O1_I1(w, wr); + case INDEX_op_dup2_vec: + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_xor_vec: + case INDEX_op_or_vec: + case INDEX_op_and_vec: + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, w); + default: g_assert_not_reached(); } @@ -2126,12 +2146,18 @@ static void tcg_target_init(TCGContext *s) { /* Only probe for the platform and capabilities if we havn't already determined maximum values at compile time. */ -#ifndef use_idiv_instructions +#if !defined(use_idiv_instructions) || !defined(use_neon_instructions) { unsigned long hwcap = qemu_getauxval(AT_HWCAP); +#ifndef use_idiv_instructions use_idiv_instructions = (hwcap & HWCAP_ARM_IDIVA) != 0; +#endif +#ifndef use_neon_instructions + use_neon_instructions = (hwcap & HWCAP_ARM_NEON) != 0; +#endif } #endif + if (__ARM_ARCH < 7) { const char *pl = (const char *)qemu_getauxval(AT_PLATFORM); if (pl != NULL && pl[0] == 'v' && pl[1] >= '4' && pl[1] <= '9') { @@ -2139,7 +2165,7 @@ static void tcg_target_init(TCGContext *s) } } - tcg_target_available_regs[TCG_TYPE_I32] = 0xffff; + tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; tcg_target_call_clobber_regs = 0; tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R0); @@ -2149,10 +2175,29 @@ static void tcg_target_init(TCGContext *s) tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R12); tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R14); + if (use_neon_instructions) { + tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; + tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; + + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q0); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q1); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q2); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q3); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q8); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q9); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q10); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q11); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q12); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q13); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q14); + tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q15); + } + s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_PC); + tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP); } static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, @@ -2186,6 +2231,42 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type, tcg_out_movi32(s, COND_AL, ret, arg); } +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg rd, TCGReg rs) +{ + g_assert_not_reached(); +} + +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg rd, TCGReg base, intptr_t offset) +{ + g_assert_not_reached(); +} + +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg rd, int64_t v64) +{ + g_assert_not_reached(); +} + +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, + unsigned vecl, unsigned vece, + const TCGArg *args, const int *const_args) +{ + g_assert_not_reached(); +} + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + return 0; +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + g_assert_not_reached(); +} + static void tcg_out_nop_fill(tcg_insn_unit *p, int count) { int i; diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 8d1fee6327..a9dc09bd08 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -78,19 +78,38 @@ typedef enum { TCG_REG_R13, TCG_REG_R14, TCG_REG_PC, + + TCG_REG_Q0, + TCG_REG_Q1, + TCG_REG_Q2, + TCG_REG_Q3, + TCG_REG_Q4, + TCG_REG_Q5, + TCG_REG_Q6, + TCG_REG_Q7, + TCG_REG_Q8, + TCG_REG_Q9, + TCG_REG_Q10, + TCG_REG_Q11, + TCG_REG_Q12, + TCG_REG_Q13, + TCG_REG_Q14, + TCG_REG_Q15, + + TCG_AREG0 = TCG_REG_R6, + TCG_REG_CALL_STACK = TCG_REG_R13, } TCGReg; -#define TCG_TARGET_NB_REGS 16 +#define TCG_TARGET_NB_REGS 32 #ifdef __ARM_ARCH_EXT_IDIV__ #define use_idiv_instructions 1 #else extern bool use_idiv_instructions; #endif - +#define use_neon_instructions 0 /* used for function call generation */ -#define TCG_REG_CALL_STACK TCG_REG_R13 #define TCG_TARGET_STACK_ALIGN 8 #define TCG_TARGET_CALL_ALIGN_ARGS 1 #define TCG_TARGET_CALL_STACK_OFFSET 0 @@ -128,9 +147,26 @@ extern bool use_idiv_instructions; #define TCG_TARGET_HAS_direct_jump 0 #define TCG_TARGET_HAS_qemu_st8_i32 0 -enum { - TCG_AREG0 = TCG_REG_R6, -}; +#define TCG_TARGET_HAS_v64 use_neon_instructions +#define TCG_TARGET_HAS_v128 use_neon_instructions +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 0 +#define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_neg_vec 0 +#define TCG_TARGET_HAS_abs_vec 0 +#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_rots_vec 0 +#define TCG_TARGET_HAS_rotv_vec 0 +#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_mul_vec 0 +#define TCG_TARGET_HAS_sat_vec 0 +#define TCG_TARGET_HAS_minmax_vec 0 +#define TCG_TARGET_HAS_bitsel_vec 0 +#define TCG_TARGET_HAS_cmpsel_vec 0 #define TCG_TARGET_DEFAULT_MO (0) #define TCG_TARGET_HAS_MEMORY_BSWAP 1 diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h new file mode 100644 index 0000000000..7a4578e9b4 --- /dev/null +++ b/tcg/arm/tcg-target.opc.h @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2019 Linaro + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. + * + * See the COPYING file in the top-level directory for details. + * + * Target-specific opcodes for host vector expansion. These will be + * emitted by tcg_expand_vec_op. For those familiar with GCC internals, + * consider these to be UNSPEC with names. + */ From 6e49fad23f56f2618ddcbca78f0d4e1fc692ae9d Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 3 May 2021 16:48:03 -0700 Subject: [PATCH 03/15] tcg/arm: Implement tcg_out_ld/st for vector types Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 70 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 4770d0c537..c6cfb83308 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -172,6 +172,9 @@ typedef enum { INSN_NOP_v6k = 0xe320f000, /* Otherwise the assembler uses mov r0,r0 */ INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, + + INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ + INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ } ARMInsn; #define INSN_NOP (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4) @@ -1093,6 +1096,33 @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args, } } +/* + * Note that TCGReg references Q-registers. + * Q-regno = 2 * D-regno, so shift left by 1 whlie inserting. + */ +static uint32_t encode_vd(TCGReg rd) +{ + tcg_debug_assert(rd >= TCG_REG_Q0); + return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13); +} + +static void tcg_out_vldst(TCGContext *s, ARMInsn insn, + TCGReg rd, TCGReg rn, int offset) +{ + if (offset != 0) { + if (check_fit_imm(offset) || check_fit_imm(-offset)) { + tcg_out_dat_rIN(s, COND_AL, ARITH_ADD, ARITH_SUB, + TCG_REG_TMP, rn, offset, true); + } else { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, offset); + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, + TCG_REG_TMP, TCG_REG_TMP, rn, 0); + } + rn = TCG_REG_TMP; + } + tcg_out32(s, insn | (rn << 16) | encode_vd(rd) | 0xf); +} + #ifdef CONFIG_SOFTMMU #include "../tcg-ldst.c.inc" @@ -2200,16 +2230,44 @@ static void tcg_target_init(TCGContext *s) tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP); } -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, + TCGReg arg1, intptr_t arg2) { - tcg_out_ld32u(s, COND_AL, arg, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + tcg_out_ld32u(s, COND_AL, arg, arg1, arg2); + return; + case TCG_TYPE_V64: + /* regs 1; size 8; align 8 */ + tcg_out_vldst(s, INSN_VLD1 | 0x7d0, arg, arg1, arg2); + return; + case TCG_TYPE_V128: + /* regs 2; size 8; align 16 */ + tcg_out_vldst(s, INSN_VLD1 | 0xae0, arg, arg1, arg2); + return; + default: + g_assert_not_reached(); + } } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, + TCGReg arg1, intptr_t arg2) { - tcg_out_st32(s, COND_AL, arg, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + tcg_out_st32(s, COND_AL, arg, arg1, arg2); + return; + case TCG_TYPE_V64: + /* regs 1; size 8; align 8 */ + tcg_out_vldst(s, INSN_VST1 | 0x7d0, arg, arg1, arg2); + return; + case TCG_TYPE_V128: + /* regs 2; size 8; align 16 */ + tcg_out_vldst(s, INSN_VST1 | 0xae0, arg, arg1, arg2); + return; + default: + g_assert_not_reached(); + } } static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, From 2df2a8cf77e76370698164e3d361e07075d2a699 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 3 May 2021 16:48:07 -0700 Subject: [PATCH 04/15] tcg/arm: Implement tcg_out_mov for vector types Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 52 +++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index c6cfb83308..a802d4b585 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -173,6 +173,8 @@ typedef enum { /* Otherwise the assembler uses mov r0,r0 */ INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, + INSN_VORR = 0xf2200110, + INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ } ARMInsn; @@ -1106,6 +1108,25 @@ static uint32_t encode_vd(TCGReg rd) return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13); } +static uint32_t encode_vn(TCGReg rn) +{ + tcg_debug_assert(rn >= TCG_REG_Q0); + return (extract32(rn, 3, 1) << 7) | (extract32(rn, 0, 3) << 17); +} + +static uint32_t encode_vm(TCGReg rm) +{ + tcg_debug_assert(rm >= TCG_REG_Q0); + return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1); +} + +static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, + TCGReg d, TCGReg n, TCGReg m) +{ + tcg_out32(s, insn | (vece << 20) | (q << 6) | + encode_vd(d) | encode_vn(n) | encode_vm(m)); +} + static void tcg_out_vldst(TCGContext *s, ARMInsn insn, TCGReg rd, TCGReg rn, int offset) { @@ -2276,16 +2297,35 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, return false; } -static inline bool tcg_out_mov(TCGContext *s, TCGType type, - TCGReg ret, TCGReg arg) +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - tcg_out_mov_reg(s, COND_AL, ret, arg); - return true; + if (ret == arg) { + return true; + } + switch (type) { + case TCG_TYPE_I32: + if (ret < TCG_REG_Q0 && arg < TCG_REG_Q0) { + tcg_out_mov_reg(s, COND_AL, ret, arg); + return true; + } + return false; + + case TCG_TYPE_V64: + case TCG_TYPE_V128: + /* "VMOV D,N" is an alias for "VORR D,N,N". */ + tcg_out_vreg3(s, INSN_VORR, type - TCG_TYPE_V64, 0, ret, arg, arg); + return true; + + default: + g_assert_not_reached(); + } } -static inline void tcg_out_movi(TCGContext *s, TCGType type, - TCGReg ret, tcg_target_long arg) +static void tcg_out_movi(TCGContext *s, TCGType type, + TCGReg ret, tcg_target_long arg) { + tcg_debug_assert(type == TCG_TYPE_I32); + tcg_debug_assert(ret < TCG_REG_Q0); tcg_out_movi32(s, COND_AL, ret, arg); } From 213e8d84735f56e16d4485509ef48ccb6488d4a9 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 00:03:27 -0700 Subject: [PATCH 05/15] tcg/arm: Implement tcg_out_dup*_vec Most of dupi is copied from tcg/aarch64, which has the same encoding for AdvSimdExpandImm. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 283 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 275 insertions(+), 8 deletions(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index a802d4b585..1707214b46 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -175,8 +175,13 @@ typedef enum { INSN_VORR = 0xf2200110, + INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ + INSN_VDUP_S = 0xf3b00c00, /* VDUP (scalar) */ + INSN_VLDR_D = 0xed100b00, /* VLDR.64 */ INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ + INSN_VLD1R = 0xf4a00c00, /* VLD1 (single element to all lanes) */ INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ + INSN_VMOVI = 0xf2800010, /* VMOV (immediate) */ } ARMInsn; #define INSN_NOP (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4) @@ -195,6 +200,14 @@ static const uint8_t tcg_cond_to_arm_cond[] = { [TCG_COND_GTU] = COND_HI, }; +static int encode_imm(uint32_t imm); + +/* TCG private relocation type: add with pc+imm8 */ +#define R_ARM_PC8 11 + +/* TCG private relocation type: vldr with imm8 << 2 */ +#define R_ARM_PC11 12 + static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target) { const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); @@ -226,16 +239,52 @@ static bool reloc_pc13(tcg_insn_unit *src_rw, const tcg_insn_unit *target) return false; } +static bool reloc_pc11(tcg_insn_unit *src_rw, const tcg_insn_unit *target) +{ + const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); + ptrdiff_t offset = (tcg_ptr_byte_diff(target, src_rx) - 8) / 4; + + if (offset >= -0xff && offset <= 0xff) { + tcg_insn_unit insn = *src_rw; + bool u = (offset >= 0); + if (!u) { + offset = -offset; + } + insn = deposit32(insn, 23, 1, u); + insn = deposit32(insn, 0, 8, offset); + *src_rw = insn; + return true; + } + return false; +} + +static bool reloc_pc8(tcg_insn_unit *src_rw, const tcg_insn_unit *target) +{ + const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); + ptrdiff_t offset = tcg_ptr_byte_diff(target, src_rx) - 8; + int rot = encode_imm(offset); + + if (rot >= 0) { + *src_rw = deposit32(*src_rw, 0, 12, rol32(offset, rot) | (rot << 7)); + return true; + } + return false; +} + static bool patch_reloc(tcg_insn_unit *code_ptr, int type, intptr_t value, intptr_t addend) { tcg_debug_assert(addend == 0); - - if (type == R_ARM_PC24) { + switch (type) { + case R_ARM_PC24: return reloc_pc24(code_ptr, (const tcg_insn_unit *)value); - } else if (type == R_ARM_PC13) { + case R_ARM_PC13: return reloc_pc13(code_ptr, (const tcg_insn_unit *)value); - } else { + case R_ARM_PC11: + return reloc_pc11(code_ptr, (const tcg_insn_unit *)value); + case R_ARM_PC8: + return reloc_pc8(code_ptr, (const tcg_insn_unit *)value); + default: g_assert_not_reached(); } } @@ -275,7 +324,7 @@ static inline uint32_t rotl(uint32_t val, int n) /* ARM immediates for ALU instructions are made of an unsigned 8-bit right-rotated by an even amount between 0 and 30. */ -static inline int encode_imm(uint32_t imm) +static int encode_imm(uint32_t imm) { int shift; @@ -302,6 +351,79 @@ static inline int check_fit_imm(uint32_t imm) return encode_imm(imm) >= 0; } +/* Return true if v16 is a valid 16-bit shifted immediate. */ +static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) +{ + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + return false; +} + +/* Return true if v32 is a valid 32-bit shifted immediate. */ +static bool is_shimm32(uint32_t v32, int *cmode, int *imm8) +{ + if (v32 == (v32 & 0xff)) { + *cmode = 0x0; + *imm8 = v32 & 0xff; + return true; + } else if (v32 == (v32 & 0xff00)) { + *cmode = 0x2; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if (v32 == (v32 & 0xff0000)) { + *cmode = 0x4; + *imm8 = (v32 >> 16) & 0xff; + return true; + } else if (v32 == (v32 & 0xff000000)) { + *cmode = 0x6; + *imm8 = v32 >> 24; + return true; + } + return false; +} + +/* Return true if v32 is a valid 32-bit shifting ones immediate. */ +static bool is_soimm32(uint32_t v32, int *cmode, int *imm8) +{ + if ((v32 & 0xffff00ff) == 0xff) { + *cmode = 0xc; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if ((v32 & 0xff00ffff) == 0xffff) { + *cmode = 0xd; + *imm8 = (v32 >> 16) & 0xff; + return true; + } + return false; +} + +/* + * Return non-zero if v32 can be formed by MOVI+ORR. + * Place the parameters for MOVI in (cmode, imm8). + * Return the cmode for ORR; the imm8 can be had via extraction from v32. + */ +static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8) +{ + int i; + + for (i = 6; i > 0; i -= 2) { + /* Mask out one byte we can add with ORR. */ + uint32_t tmp = v32 & ~(0xffu << (i * 4)); + if (is_shimm32(tmp, cmode, imm8) || + is_soimm32(tmp, cmode, imm8)) { + break; + } + } + return i; +} + /* Test if a constant matches the constraint. * TODO: define constraints for: * @@ -1127,6 +1249,15 @@ static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, encode_vd(d) | encode_vn(n) | encode_vm(m)); } +static void tcg_out_vmovi(TCGContext *s, TCGReg rd, + int q, int op, int cmode, uint8_t imm8) +{ + tcg_out32(s, INSN_VMOVI | encode_vd(rd) | (q << 6) | (op << 5) + | (cmode << 8) | extract32(imm8, 0, 4) + | (extract32(imm8, 4, 3) << 16) + | (extract32(imm8, 7, 1) << 24)); +} + static void tcg_out_vldst(TCGContext *s, ARMInsn insn, TCGReg rd, TCGReg rn, int offset) { @@ -2329,22 +2460,158 @@ static void tcg_out_movi(TCGContext *s, TCGType type, tcg_out_movi32(s, COND_AL, ret, arg); } +/* Type is always V128, with I64 elements. */ +static void tcg_out_dup2_vec(TCGContext *s, TCGReg rd, TCGReg rl, TCGReg rh) +{ + /* Move high element into place first. */ + /* VMOV Dd+1, Ds */ + tcg_out_vreg3(s, INSN_VORR | (1 << 12), 0, 0, rd, rh, rh); + /* Move low element into place; tcg_out_mov will check for nop. */ + tcg_out_mov(s, TCG_TYPE_V64, rd, rl); +} + static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, TCGReg rs) { - g_assert_not_reached(); + int q = type - TCG_TYPE_V64; + + if (vece == MO_64) { + if (type == TCG_TYPE_V128) { + tcg_out_dup2_vec(s, rd, rs, rs); + } else { + tcg_out_mov(s, TCG_TYPE_V64, rd, rs); + } + } else if (rs < TCG_REG_Q0) { + int b = (vece == MO_8); + int e = (vece == MO_16); + tcg_out32(s, INSN_VDUP_G | (b << 22) | (q << 21) | (e << 5) | + encode_vn(rd) | (rs << 12)); + } else { + int imm4 = 1 << vece; + tcg_out32(s, INSN_VDUP_S | (imm4 << 16) | (q << 6) | + encode_vd(rd) | encode_vm(rs)); + } + return true; } static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, TCGReg base, intptr_t offset) { - g_assert_not_reached(); + if (vece == MO_64) { + tcg_out_ld(s, TCG_TYPE_V64, rd, base, offset); + if (type == TCG_TYPE_V128) { + tcg_out_dup2_vec(s, rd, rd, rd); + } + } else { + int q = type - TCG_TYPE_V64; + tcg_out_vldst(s, INSN_VLD1R | (vece << 6) | (q << 5), + rd, base, offset); + } + return true; } static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) { - g_assert_not_reached(); + int q = type - TCG_TYPE_V64; + int cmode, imm8, i; + + /* Test all bytes equal first. */ + if (vece == MO_8) { + tcg_out_vmovi(s, rd, q, 0, 0xe, v64); + return; + } + + /* + * Test all bytes 0x00 or 0xff second. This can match cases that + * might otherwise take 2 or 3 insns for MO_16 or MO_32 below. + */ + for (i = imm8 = 0; i < 8; i++) { + uint8_t byte = v64 >> (i * 8); + if (byte == 0xff) { + imm8 |= 1 << i; + } else if (byte != 0) { + goto fail_bytes; + } + } + tcg_out_vmovi(s, rd, q, 1, 0xe, imm8); + return; + fail_bytes: + + /* + * Tests for various replications. For each element width, if we + * cannot find an expansion there's no point checking a larger + * width because we already know by replication it cannot match. + */ + if (vece == MO_16) { + uint16_t v16 = v64; + + if (is_shimm16(v16, &cmode, &imm8)) { + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); + return; + } + if (is_shimm16(~v16, &cmode, &imm8)) { + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); + return; + } + + /* + * Otherwise, all remaining constants can be loaded in two insns: + * rd = v16 & 0xff, rd |= v16 & 0xff00. + */ + tcg_out_vmovi(s, rd, q, 0, 0x8, v16 & 0xff); + tcg_out_vmovi(s, rd, q, 0, 0xb, v16 >> 8); /* VORRI */ + return; + } + + if (vece == MO_32) { + uint32_t v32 = v64; + + if (is_shimm32(v32, &cmode, &imm8) || + is_soimm32(v32, &cmode, &imm8)) { + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); + return; + } + if (is_shimm32(~v32, &cmode, &imm8) || + is_soimm32(~v32, &cmode, &imm8)) { + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); + return; + } + + /* + * Restrict the set of constants to those we can load with + * two instructions. Others we load from the pool. + */ + i = is_shimm32_pair(v32, &cmode, &imm8); + if (i) { + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); + tcg_out_vmovi(s, rd, q, 0, i | 1, extract32(v32, i * 4, 8)); + return; + } + i = is_shimm32_pair(~v32, &cmode, &imm8); + if (i) { + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); + tcg_out_vmovi(s, rd, q, 1, i | 1, extract32(~v32, i * 4, 8)); + return; + } + } + + /* + * As a last resort, load from the constant pool. + */ + if (!q || vece == MO_64) { + new_pool_l2(s, R_ARM_PC11, s->code_ptr, 0, v64, v64 >> 32); + /* VLDR Dd, [pc + offset] */ + tcg_out32(s, INSN_VLDR_D | encode_vd(rd) | (0xf << 16)); + if (q) { + tcg_out_dup2_vec(s, rd, rd, rd); + } + } else { + new_pool_label(s, (uint32_t)v64, R_ARM_PC8, s->code_ptr, 0); + /* add tmp, pc, offset */ + tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_TMP, TCG_REG_PC, 0); + tcg_out_dupm_vec(s, type, MO_32, rd, TCG_REG_TMP, 0); + } } static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, From d74b86ed4ad452eb5069ca0c168c731a5c429127 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 15:54:33 -0700 Subject: [PATCH 06/15] tcg/arm: Implement minimal vector operations Implementing dup2, add, sub, and, or, xor as the minimal set. This allows us to actually enable neon in the header file. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target-con-set.h | 3 + tcg/arm/tcg-target-con-str.h | 2 + tcg/arm/tcg-target.c.inc | 201 +++++++++++++++++++++++++++++++++-- tcg/arm/tcg-target.h | 6 +- 4 files changed, 204 insertions(+), 8 deletions(-) diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index 27aced5391..f30b3900e0 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -30,6 +30,9 @@ C_O1_I2(r, r, rIN) C_O1_I2(r, r, ri) C_O1_I2(r, rZ, rZ) C_O1_I2(w, w, w) +C_O1_I2(w, w, wO) +C_O1_I2(w, w, wV) +C_O1_I2(w, w, wZ) C_O1_I4(r, r, r, rI, rI) C_O1_I4(r, r, rIN, rIK, 0) C_O2_I1(r, r, l) diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h index 255a1ae0e2..8f501149e1 100644 --- a/tcg/arm/tcg-target-con-str.h +++ b/tcg/arm/tcg-target-con-str.h @@ -20,4 +20,6 @@ REGS('w', ALL_VECTOR_REGS) CONST('I', TCG_CT_CONST_ARM) CONST('K', TCG_CT_CONST_INV) CONST('N', TCG_CT_CONST_NEG) +CONST('O', TCG_CT_CONST_ORRI) +CONST('V', TCG_CT_CONST_ANDI) CONST('Z', TCG_CT_CONST_ZERO) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 1707214b46..ab98fa1381 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -30,6 +30,9 @@ int arm_arch = __ARM_ARCH; #ifndef use_idiv_instructions bool use_idiv_instructions; #endif +#ifndef use_neon_instructions +bool use_neon_instructions; +#endif /* ??? Ought to think about changing CONFIG_SOFTMMU to always defined. */ #ifdef CONFIG_SOFTMMU @@ -173,7 +176,27 @@ typedef enum { /* Otherwise the assembler uses mov r0,r0 */ INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, + INSN_VADD = 0xf2000800, + INSN_VAND = 0xf2000110, + INSN_VEOR = 0xf3000110, INSN_VORR = 0xf2200110, + INSN_VSUB = 0xf3000800, + + INSN_VMVN = 0xf3b00580, + + INSN_VCEQ0 = 0xf3b10100, + INSN_VCGT0 = 0xf3b10000, + INSN_VCGE0 = 0xf3b10080, + INSN_VCLE0 = 0xf3b10180, + INSN_VCLT0 = 0xf3b10200, + + INSN_VCEQ = 0xf3000810, + INSN_VCGE = 0xf2000310, + INSN_VCGT = 0xf2000300, + INSN_VCGE_U = 0xf3000310, + INSN_VCGT_U = 0xf3000300, + + INSN_VTST = 0xf2000810, INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ INSN_VDUP_S = 0xf3b00c00, /* VDUP (scalar) */ @@ -293,6 +316,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, #define TCG_CT_CONST_INV 0x200 #define TCG_CT_CONST_NEG 0x400 #define TCG_CT_CONST_ZERO 0x800 +#define TCG_CT_CONST_ORRI 0x1000 +#define TCG_CT_CONST_ANDI 0x2000 #define ALL_GENERAL_REGS 0xffffu #define ALL_VECTOR_REGS 0xffff0000u @@ -424,6 +449,16 @@ static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8) return i; } +/* Return true if V is a valid 16-bit or 32-bit shifted immediate. */ +static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8) +{ + if (v32 == deposit32(v32, 16, 16, v32)) { + return is_shimm16(v32, cmode, imm8); + } else { + return is_shimm32(v32, cmode, imm8); + } +} + /* Test if a constant matches the constraint. * TODO: define constraints for: * @@ -444,9 +479,26 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) return 1; } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) { return 1; - } else { - return 0; } + + switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) { + case 0: + break; + case TCG_CT_CONST_ANDI: + val = ~val; + /* fallthru */ + case TCG_CT_CONST_ORRI: + if (val == deposit64(val, 32, 32, val)) { + int cmode, imm8; + return is_shimm1632(val, &cmode, &imm8); + } + break; + default: + /* Both bits should not be set for the same insn. */ + g_assert_not_reached(); + } + + return 0; } static inline void tcg_out_b(TCGContext *s, int cond, int32_t offset) @@ -1242,6 +1294,13 @@ static uint32_t encode_vm(TCGReg rm) return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1); } +static void tcg_out_vreg2(TCGContext *s, ARMInsn insn, int q, int vece, + TCGReg d, TCGReg m) +{ + tcg_out32(s, insn | (vece << 18) | (q << 6) | + encode_vd(d) | encode_vm(m)); +} + static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, TCGReg d, TCGReg n, TCGReg m) { @@ -2314,10 +2373,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_add_vec: case INDEX_op_sub_vec: case INDEX_op_xor_vec: - case INDEX_op_or_vec: - case INDEX_op_and_vec: - case INDEX_op_cmp_vec: return C_O1_I2(w, w, w); + case INDEX_op_or_vec: + return C_O1_I2(w, w, wO); + case INDEX_op_and_vec: + return C_O1_I2(w, w, wV); + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, wZ); default: g_assert_not_reached(); @@ -2614,16 +2676,141 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, } } +static const ARMInsn vec_cmp_insn[16] = { + [TCG_COND_EQ] = INSN_VCEQ, + [TCG_COND_GT] = INSN_VCGT, + [TCG_COND_GE] = INSN_VCGE, + [TCG_COND_GTU] = INSN_VCGT_U, + [TCG_COND_GEU] = INSN_VCGE_U, +}; + +static const ARMInsn vec_cmp0_insn[16] = { + [TCG_COND_EQ] = INSN_VCEQ0, + [TCG_COND_GT] = INSN_VCGT0, + [TCG_COND_GE] = INSN_VCGE0, + [TCG_COND_LT] = INSN_VCLT0, + [TCG_COND_LE] = INSN_VCLE0, +}; + static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned vece, const TCGArg *args, const int *const_args) { - g_assert_not_reached(); + TCGType type = vecl + TCG_TYPE_V64; + unsigned q = vecl; + TCGArg a0, a1, a2; + int cmode, imm8; + + a0 = args[0]; + a1 = args[1]; + a2 = args[2]; + + switch (opc) { + case INDEX_op_ld_vec: + tcg_out_ld(s, type, a0, a1, a2); + return; + case INDEX_op_st_vec: + tcg_out_st(s, type, a0, a1, a2); + return; + case INDEX_op_dupm_vec: + tcg_out_dupm_vec(s, type, vece, a0, a1, a2); + return; + case INDEX_op_dup2_vec: + tcg_out_dup2_vec(s, a0, a1, a2); + return; + case INDEX_op_add_vec: + tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); + return; + case INDEX_op_sub_vec: + tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); + return; + case INDEX_op_xor_vec: + tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); + return; + + case INDEX_op_and_vec: + if (const_args[2]) { + is_shimm1632(~a2, &cmode, &imm8); + if (a0 == a1) { + tcg_out_vmovi(s, a0, q, 1, cmode | 1, imm8); /* VBICI */ + return; + } + tcg_out_vmovi(s, a0, q, 1, cmode, imm8); /* VMVNI */ + a2 = a0; + } + tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2); + return; + + case INDEX_op_or_vec: + if (const_args[2]) { + is_shimm1632(a2, &cmode, &imm8); + if (a0 == a1) { + tcg_out_vmovi(s, a0, q, 0, cmode | 1, imm8); /* VORRI */ + return; + } + tcg_out_vmovi(s, a0, q, 0, cmode, imm8); /* VMOVI */ + a2 = a0; + } + tcg_out_vreg3(s, INSN_VORR, q, 0, a0, a1, a2); + return; + + case INDEX_op_cmp_vec: + { + TCGCond cond = args[3]; + + if (cond == TCG_COND_NE) { + if (const_args[2]) { + tcg_out_vreg3(s, INSN_VTST, q, vece, a0, a1, a1); + } else { + tcg_out_vreg3(s, INSN_VCEQ, q, vece, a0, a1, a2); + tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a0); + } + } else { + ARMInsn insn; + + if (const_args[2]) { + insn = vec_cmp0_insn[cond]; + if (insn) { + tcg_out_vreg2(s, insn, q, vece, a0, a1); + return; + } + tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0); + a2 = TCG_VEC_TMP; + } + insn = vec_cmp_insn[cond]; + if (insn == 0) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + insn = vec_cmp_insn[cond]; + tcg_debug_assert(insn != 0); + } + tcg_out_vreg3(s, insn, q, vece, a0, a1, a2); + } + } + return; + + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ + case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ + default: + g_assert_not_reached(); + } } int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) { - return 0; + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + return 1; + case INDEX_op_cmp_vec: + return vece < MO_64; + default: + return 0; + } } void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index a9dc09bd08..48993636ea 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -107,7 +107,11 @@ typedef enum { #else extern bool use_idiv_instructions; #endif -#define use_neon_instructions 0 +#ifdef __ARM_NEON__ +#define use_neon_instructions 1 +#else +extern bool use_neon_instructions; +#endif /* used for function call generation */ #define TCG_TARGET_STACK_ALIGN 8 From 7df44cf6e9e5726c5f9c56a398fc606566673007 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 11:58:47 -0700 Subject: [PATCH 07/15] tcg/arm: Implement andc, orc, abs, neg, not vector operations These logical and arithmetic operations are optional, but are trivial to accomplish with the existing infrastructure. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target-con-set.h | 1 + tcg/arm/tcg-target.c.inc | 38 ++++++++++++++++++++++++++++++++++++ tcg/arm/tcg-target.h | 10 +++++----- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index f30b3900e0..cc006f99cd 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -20,6 +20,7 @@ C_O0_I4(s, s, s, s) C_O1_I1(r, l) C_O1_I1(r, r) C_O1_I1(w, r) +C_O1_I1(w, w) C_O1_I1(w, wr) C_O1_I2(r, 0, rZ) C_O1_I2(r, l, l) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index ab98fa1381..2286e0aa09 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -178,11 +178,15 @@ typedef enum { INSN_VADD = 0xf2000800, INSN_VAND = 0xf2000110, + INSN_VBIC = 0xf2100110, INSN_VEOR = 0xf3000110, + INSN_VORN = 0xf2300110, INSN_VORR = 0xf2200110, INSN_VSUB = 0xf3000800, + INSN_VABS = 0xf3b10300, INSN_VMVN = 0xf3b00580, + INSN_VNEG = 0xf3b10380, INSN_VCEQ0 = 0xf3b10100, INSN_VCGT0 = 0xf3b10000, @@ -2369,14 +2373,20 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) return C_O1_I1(w, r); case INDEX_op_dup_vec: return C_O1_I1(w, wr); + case INDEX_op_abs_vec: + case INDEX_op_neg_vec: + case INDEX_op_not_vec: + return C_O1_I1(w, w); case INDEX_op_dup2_vec: case INDEX_op_add_vec: case INDEX_op_sub_vec: case INDEX_op_xor_vec: return C_O1_I2(w, w, w); case INDEX_op_or_vec: + case INDEX_op_andc_vec: return C_O1_I2(w, w, wO); case INDEX_op_and_vec: + case INDEX_op_orc_vec: return C_O1_I2(w, w, wV); case INDEX_op_cmp_vec: return C_O1_I2(w, w, wZ); @@ -2718,6 +2728,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_dup2_vec: tcg_out_dup2_vec(s, a0, a1, a2); return; + case INDEX_op_abs_vec: + tcg_out_vreg2(s, INSN_VABS, q, vece, a0, a1); + return; + case INDEX_op_neg_vec: + tcg_out_vreg2(s, INSN_VNEG, q, vece, a0, a1); + return; + case INDEX_op_not_vec: + tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a1); + return; case INDEX_op_add_vec: tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); return; @@ -2728,6 +2747,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); return; + case INDEX_op_andc_vec: + if (!const_args[2]) { + tcg_out_vreg3(s, INSN_VBIC, q, 0, a0, a1, a2); + return; + } + a2 = ~a2; + /* fall through */ case INDEX_op_and_vec: if (const_args[2]) { is_shimm1632(~a2, &cmode, &imm8); @@ -2741,6 +2767,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2); return; + case INDEX_op_orc_vec: + if (!const_args[2]) { + tcg_out_vreg3(s, INSN_VORN, q, 0, a0, a1, a2); + return; + } + a2 = ~a2; + /* fall through */ case INDEX_op_or_vec: if (const_args[2]) { is_shimm1632(a2, &cmode, &imm8); @@ -2803,10 +2836,15 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_add_vec: case INDEX_op_sub_vec: case INDEX_op_and_vec: + case INDEX_op_andc_vec: case INDEX_op_or_vec: + case INDEX_op_orc_vec: case INDEX_op_xor_vec: + case INDEX_op_not_vec: return 1; + case INDEX_op_abs_vec: case INDEX_op_cmp_vec: + case INDEX_op_neg_vec: return vece < MO_64; default: return 0; diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 48993636ea..6ac9fc6b9b 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -155,11 +155,11 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_v128 use_neon_instructions #define TCG_TARGET_HAS_v256 0 -#define TCG_TARGET_HAS_andc_vec 0 -#define TCG_TARGET_HAS_orc_vec 0 -#define TCG_TARGET_HAS_not_vec 0 -#define TCG_TARGET_HAS_neg_vec 0 -#define TCG_TARGET_HAS_abs_vec 0 +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_abs_vec 1 #define TCG_TARGET_HAS_roti_vec 0 #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 From d4c4e9c51b91d413cf3020ce80ba4914186bfbb4 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 12:24:28 -0700 Subject: [PATCH 08/15] tcg/arm: Implement TCG_TARGET_HAS_shi_vec This consists of the three immediate shifts: shli, shri, sari. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 27 +++++++++++++++++++++++++++ tcg/arm/tcg-target.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 2286e0aa09..d21aaab6f7 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -200,6 +200,10 @@ typedef enum { INSN_VCGE_U = 0xf3000310, INSN_VCGT_U = 0xf3000300, + INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ + INSN_VSARI = 0xf2800010, /* VSHR.S */ + INSN_VSHRI = 0xf3800010, /* VSHR.U */ + INSN_VTST = 0xf2000810, INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ @@ -1321,6 +1325,14 @@ static void tcg_out_vmovi(TCGContext *s, TCGReg rd, | (extract32(imm8, 7, 1) << 24)); } +static void tcg_out_vshifti(TCGContext *s, ARMInsn insn, int q, + TCGReg rd, TCGReg rm, int l_imm6) +{ + tcg_out32(s, insn | (q << 6) | encode_vd(rd) | encode_vm(rm) | + (extract32(l_imm6, 6, 1) << 7) | + (extract32(l_imm6, 0, 6) << 16)); +} + static void tcg_out_vldst(TCGContext *s, ARMInsn insn, TCGReg rd, TCGReg rn, int offset) { @@ -2376,6 +2388,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_abs_vec: case INDEX_op_neg_vec: case INDEX_op_not_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: return C_O1_I1(w, w); case INDEX_op_dup2_vec: case INDEX_op_add_vec: @@ -2746,6 +2761,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_xor_vec: tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); return; + case INDEX_op_shli_vec: + tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece)); + return; + case INDEX_op_shri_vec: + tcg_out_vshifti(s, INSN_VSHRI, q, a0, a1, (16 << vece) - a2); + return; + case INDEX_op_sari_vec: + tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2); + return; case INDEX_op_andc_vec: if (!const_args[2]) { @@ -2841,6 +2865,9 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_orc_vec: case INDEX_op_xor_vec: case INDEX_op_not_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: return 1; case INDEX_op_abs_vec: case INDEX_op_cmp_vec: diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 6ac9fc6b9b..cfbadad72c 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -163,7 +163,7 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_roti_vec 0 #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 -#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shi_vec 1 #define TCG_TARGET_HAS_shs_vec 0 #define TCG_TARGET_HAS_shv_vec 0 #define TCG_TARGET_HAS_mul_vec 0 From 752b17693e7af43ba77cee59eb9a1ec6966b3ded Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 12:30:17 -0700 Subject: [PATCH 09/15] tcg/arm: Implement TCG_TARGET_HAS_mul_vec Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 6 ++++++ tcg/arm/tcg-target.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index d21aaab6f7..b94e6ed0f3 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -183,6 +183,7 @@ typedef enum { INSN_VORN = 0xf2300110, INSN_VORR = 0xf2200110, INSN_VSUB = 0xf3000800, + INSN_VMUL = 0xf2000910, INSN_VABS = 0xf3b10300, INSN_VMVN = 0xf3b00580, @@ -2394,6 +2395,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) return C_O1_I1(w, w); case INDEX_op_dup2_vec: case INDEX_op_add_vec: + case INDEX_op_mul_vec: case INDEX_op_sub_vec: case INDEX_op_xor_vec: return C_O1_I2(w, w, w); @@ -2755,6 +2757,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_add_vec: tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); return; + case INDEX_op_mul_vec: + tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2); + return; case INDEX_op_sub_vec: tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); return; @@ -2871,6 +2876,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) return 1; case INDEX_op_abs_vec: case INDEX_op_cmp_vec: + case INDEX_op_mul_vec: case INDEX_op_neg_vec: return vece < MO_64; default: diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index cfbadad72c..94d768f249 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -166,7 +166,7 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_shi_vec 1 #define TCG_TARGET_HAS_shs_vec 0 #define TCG_TARGET_HAS_shv_vec 0 -#define TCG_TARGET_HAS_mul_vec 0 +#define TCG_TARGET_HAS_mul_vec 1 #define TCG_TARGET_HAS_sat_vec 0 #define TCG_TARGET_HAS_minmax_vec 0 #define TCG_TARGET_HAS_bitsel_vec 0 From 4fcd301707ccc656f27e3dc324cdbe20122a9740 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 12:37:36 -0700 Subject: [PATCH 10/15] tcg/arm: Implement TCG_TARGET_HAS_sat_vec This is saturating add and subtract, signed and unsigned. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++ tcg/arm/tcg-target.h | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index b94e6ed0f3..f0cfed7700 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -184,6 +184,10 @@ typedef enum { INSN_VORR = 0xf2200110, INSN_VSUB = 0xf3000800, INSN_VMUL = 0xf2000910, + INSN_VQADD = 0xf2000010, + INSN_VQADD_U = 0xf3000010, + INSN_VQSUB = 0xf2000210, + INSN_VQSUB_U = 0xf3000210, INSN_VABS = 0xf3b10300, INSN_VMVN = 0xf3b00580, @@ -2396,7 +2400,11 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_dup2_vec: case INDEX_op_add_vec: case INDEX_op_mul_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: case INDEX_op_sub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: case INDEX_op_xor_vec: return C_O1_I2(w, w, w); case INDEX_op_or_vec: @@ -2763,6 +2771,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_sub_vec: tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); return; + case INDEX_op_ssadd_vec: + tcg_out_vreg3(s, INSN_VQADD, q, vece, a0, a1, a2); + return; + case INDEX_op_sssub_vec: + tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2); + return; + case INDEX_op_usadd_vec: + tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2); + return; + case INDEX_op_ussub_vec: + tcg_out_vreg3(s, INSN_VQSUB_U, q, vece, a0, a1, a2); + return; case INDEX_op_xor_vec: tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); return; @@ -2873,6 +2893,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_shli_vec: case INDEX_op_shri_vec: case INDEX_op_sari_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: return 1; case INDEX_op_abs_vec: case INDEX_op_cmp_vec: diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 94d768f249..71621f28e9 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -167,7 +167,7 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_shs_vec 0 #define TCG_TARGET_HAS_shv_vec 0 #define TCG_TARGET_HAS_mul_vec 1 -#define TCG_TARGET_HAS_sat_vec 0 +#define TCG_TARGET_HAS_sat_vec 1 #define TCG_TARGET_HAS_minmax_vec 0 #define TCG_TARGET_HAS_bitsel_vec 0 #define TCG_TARGET_HAS_cmpsel_vec 0 From dbbeff77645242241fe2296b88a7b1d3b3614ffe Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 12:44:06 -0700 Subject: [PATCH 11/15] tcg/arm: Implement TCG_TARGET_HAS_minmax_vec This is minimum and maximum, signed and unsigned. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++ tcg/arm/tcg-target.h | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index f0cfed7700..8193d768d6 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -188,6 +188,10 @@ typedef enum { INSN_VQADD_U = 0xf3000010, INSN_VQSUB = 0xf2000210, INSN_VQSUB_U = 0xf3000210, + INSN_VMAX = 0xf2000600, + INSN_VMAX_U = 0xf3000600, + INSN_VMIN = 0xf2000610, + INSN_VMIN_U = 0xf3000610, INSN_VABS = 0xf3b10300, INSN_VMVN = 0xf3b00580, @@ -2400,9 +2404,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_dup2_vec: case INDEX_op_add_vec: case INDEX_op_mul_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: case INDEX_op_ssadd_vec: case INDEX_op_sssub_vec: case INDEX_op_sub_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: case INDEX_op_usadd_vec: case INDEX_op_ussub_vec: case INDEX_op_xor_vec: @@ -2768,6 +2776,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_mul_vec: tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2); return; + case INDEX_op_smax_vec: + tcg_out_vreg3(s, INSN_VMAX, q, vece, a0, a1, a2); + return; + case INDEX_op_smin_vec: + tcg_out_vreg3(s, INSN_VMIN, q, vece, a0, a1, a2); + return; case INDEX_op_sub_vec: tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); return; @@ -2777,6 +2791,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_sssub_vec: tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2); return; + case INDEX_op_umax_vec: + tcg_out_vreg3(s, INSN_VMAX_U, q, vece, a0, a1, a2); + return; + case INDEX_op_umin_vec: + tcg_out_vreg3(s, INSN_VMIN_U, q, vece, a0, a1, a2); + return; case INDEX_op_usadd_vec: tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2); return; @@ -2902,6 +2922,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_cmp_vec: case INDEX_op_mul_vec: case INDEX_op_neg_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: return vece < MO_64; default: return 0; diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 71621f28e9..4815a34e75 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -168,7 +168,7 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_shv_vec 0 #define TCG_TARGET_HAS_mul_vec 1 #define TCG_TARGET_HAS_sat_vec 1 -#define TCG_TARGET_HAS_minmax_vec 0 +#define TCG_TARGET_HAS_minmax_vec 1 #define TCG_TARGET_HAS_bitsel_vec 0 #define TCG_TARGET_HAS_cmpsel_vec 0 From f2b46c7162f86b05bbc05f1728b1d2a0e6a9e457 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 12:54:37 -0700 Subject: [PATCH 12/15] tcg/arm: Implement TCG_TARGET_HAS_bitsel_vec NEON has 3 instructions implementing this 4 argument operation, with each insn overlapping a different logical input onto the destination register. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target-con-set.h | 1 + tcg/arm/tcg-target.c.inc | 22 ++++++++++++++++++++-- tcg/arm/tcg-target.h | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index cc006f99cd..d02797cbf4 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -34,6 +34,7 @@ C_O1_I2(w, w, w) C_O1_I2(w, w, wO) C_O1_I2(w, w, wV) C_O1_I2(w, w, wZ) +C_O1_I3(w, w, w, w) C_O1_I4(r, r, r, rI, rI) C_O1_I4(r, r, rIN, rIK, 0) C_O2_I1(r, r, l) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 8193d768d6..3381240909 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -213,6 +213,10 @@ typedef enum { INSN_VSARI = 0xf2800010, /* VSHR.S */ INSN_VSHRI = 0xf3800010, /* VSHR.U */ + INSN_VBSL = 0xf3100110, + INSN_VBIT = 0xf3200110, + INSN_VBIF = 0xf3300110, + INSN_VTST = 0xf2000810, INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ @@ -2423,7 +2427,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) return C_O1_I2(w, w, wV); case INDEX_op_cmp_vec: return C_O1_I2(w, w, wZ); - + case INDEX_op_bitsel_vec: + return C_O1_I3(w, w, w, w); default: g_assert_not_reached(); } @@ -2741,7 +2746,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, { TCGType type = vecl + TCG_TYPE_V64; unsigned q = vecl; - TCGArg a0, a1, a2; + TCGArg a0, a1, a2, a3; int cmode, imm8; a0 = args[0]; @@ -2892,6 +2897,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, } return; + case INDEX_op_bitsel_vec: + a3 = args[3]; + if (a0 == a3) { + tcg_out_vreg3(s, INSN_VBIT, q, 0, a0, a2, a1); + } else if (a0 == a2) { + tcg_out_vreg3(s, INSN_VBIF, q, 0, a0, a3, a1); + } else { + tcg_out_mov(s, type, a0, a1); + tcg_out_vreg3(s, INSN_VBSL, q, 0, a0, a2, a3); + } + return; + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ default: @@ -2917,6 +2934,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_sssub_vec: case INDEX_op_usadd_vec: case INDEX_op_ussub_vec: + case INDEX_op_bitsel_vec: return 1; case INDEX_op_abs_vec: case INDEX_op_cmp_vec: diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 4815a34e75..d6222ba2db 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -169,7 +169,7 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_mul_vec 1 #define TCG_TARGET_HAS_sat_vec 1 #define TCG_TARGET_HAS_minmax_vec 1 -#define TCG_TARGET_HAS_bitsel_vec 0 +#define TCG_TARGET_HAS_bitsel_vec 1 #define TCG_TARGET_HAS_cmpsel_vec 0 #define TCG_TARGET_DEFAULT_MO (0) From 31d366390cc4316e55362d40cfc52542d6eea5ab Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 13:13:10 -0700 Subject: [PATCH 13/15] tcg/arm: Implement TCG_TARGET_HAS_shv_vec The three vector shift by vector operations are all implemented via expansion. Therefore do not actually set TCG_TARGET_HAS_shv_vec, as none of shlv_vec, shrv_vec, sarv_vec may actually appear in the instruction stream, and therefore also do not appear in tcg_target_op_def. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 61 +++++++++++++++++++++++++++++++++++++++- tcg/arm/tcg-target.opc.h | 3 ++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 3381240909..a6c7889d9f 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -212,6 +212,8 @@ typedef enum { INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ INSN_VSARI = 0xf2800010, /* VSHR.S */ INSN_VSHRI = 0xf3800010, /* VSHR.U */ + INSN_VSHL_S = 0xf2000400, /* VSHL.S (register) */ + INSN_VSHL_U = 0xf3000400, /* VSHL.U (register) */ INSN_VBSL = 0xf3100110, INSN_VBIT = 0xf3200110, @@ -2418,6 +2420,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_usadd_vec: case INDEX_op_ussub_vec: case INDEX_op_xor_vec: + case INDEX_op_arm_sshl_vec: + case INDEX_op_arm_ushl_vec: return C_O1_I2(w, w, w); case INDEX_op_or_vec: case INDEX_op_andc_vec: @@ -2811,6 +2815,17 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_xor_vec: tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); return; + case INDEX_op_arm_sshl_vec: + /* + * Note that Vm is the data and Vn is the shift count, + * therefore the arguments appear reversed. + */ + tcg_out_vreg3(s, INSN_VSHL_S, q, vece, a0, a2, a1); + return; + case INDEX_op_arm_ushl_vec: + /* See above. */ + tcg_out_vreg3(s, INSN_VSHL_U, q, vece, a0, a2, a1); + return; case INDEX_op_shli_vec: tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece)); return; @@ -2945,6 +2960,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_umax_vec: case INDEX_op_umin_vec: return vece < MO_64; + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + return -1; default: return 0; } @@ -2953,7 +2972,47 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) { - g_assert_not_reached(); + va_list va; + TCGv_vec v0, v1, v2, t1; + TCGArg a2; + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); + a2 = va_arg(va, TCGArg); + va_end(va); + + switch (opc) { + case INDEX_op_shlv_vec: + /* + * Merely propagate shlv_vec to arm_ushl_vec. + * In this way we don't set TCG_TARGET_HAS_shv_vec + * because everything is done via expansion. + */ + v2 = temp_tcgv_vec(arg_temp(a2)); + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); + break; + + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + /* Right shifts are negative left shifts for NEON. */ + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + tcg_gen_neg_vec(vece, t1, v2); + if (opc == INDEX_op_shrv_vec) { + opc = INDEX_op_arm_ushl_vec; + } else { + opc = INDEX_op_arm_sshl_vec; + } + vec_gen_3(opc, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + default: + g_assert_not_reached(); + } } static void tcg_out_nop_fill(tcg_insn_unit *p, int count) diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h index 7a4578e9b4..d19153dcb9 100644 --- a/tcg/arm/tcg-target.opc.h +++ b/tcg/arm/tcg-target.opc.h @@ -10,3 +10,6 @@ * emitted by tcg_expand_vec_op. For those familiar with GCC internals, * consider these to be UNSPEC with names. */ + +DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC) +DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC) From 5047ae648b7f25d3cdb6ce4995c62aa7806abd7f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 13:26:48 -0700 Subject: [PATCH 14/15] tcg/arm: Implement TCG_TARGET_HAS_roti_vec Implement via expansion, so don't actually set TCG_TARGET_HAS_roti_vec. For NEON, this is shift-right followed by shift-left-and-insert. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target-con-set.h | 1 + tcg/arm/tcg-target.c.inc | 15 +++++++++++++++ tcg/arm/tcg-target.opc.h | 1 + 3 files changed, 17 insertions(+) diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h index d02797cbf4..3685e1786a 100644 --- a/tcg/arm/tcg-target-con-set.h +++ b/tcg/arm/tcg-target-con-set.h @@ -30,6 +30,7 @@ C_O1_I2(r, r, rIK) C_O1_I2(r, r, rIN) C_O1_I2(r, r, ri) C_O1_I2(r, rZ, rZ) +C_O1_I2(w, 0, w) C_O1_I2(w, w, w) C_O1_I2(w, w, wO) C_O1_I2(w, w, wV) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index a6c7889d9f..d0af654c65 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -212,6 +212,7 @@ typedef enum { INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ INSN_VSARI = 0xf2800010, /* VSHR.S */ INSN_VSHRI = 0xf3800010, /* VSHR.U */ + INSN_VSLI = 0xf3800510, INSN_VSHL_S = 0xf2000400, /* VSHL.S (register) */ INSN_VSHL_U = 0xf3000400, /* VSHL.U (register) */ @@ -2423,6 +2424,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_arm_sshl_vec: case INDEX_op_arm_ushl_vec: return C_O1_I2(w, w, w); + case INDEX_op_arm_sli_vec: + return C_O1_I2(w, 0, w); case INDEX_op_or_vec: case INDEX_op_andc_vec: return C_O1_I2(w, w, wO); @@ -2835,6 +2838,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_sari_vec: tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2); return; + case INDEX_op_arm_sli_vec: + tcg_out_vshifti(s, INSN_VSLI, q, a0, a2, args[3] + (8 << vece)); + return; case INDEX_op_andc_vec: if (!const_args[2]) { @@ -2963,6 +2969,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_shlv_vec: case INDEX_op_shrv_vec: case INDEX_op_sarv_vec: + case INDEX_op_rotli_vec: return -1; default: return 0; @@ -3010,6 +3017,14 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, tcg_temp_free_vec(t1); break; + case INDEX_op_rotli_vec: + t1 = tcg_temp_new_vec(type); + tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1)); + vec_gen_4(INDEX_op_arm_sli_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2); + tcg_temp_free_vec(t1); + break; + default: g_assert_not_reached(); } diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h index d19153dcb9..d38af9a808 100644 --- a/tcg/arm/tcg-target.opc.h +++ b/tcg/arm/tcg-target.opc.h @@ -11,5 +11,6 @@ * consider these to be UNSPEC with names. */ +DEF(arm_sli_vec, 1, 2, 1, IMPLVEC) DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC) DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC) From 0006039e29b9e6118beab300146f7c4931f7a217 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 5 Sep 2020 14:20:57 -0700 Subject: [PATCH 15/15] tcg/arm: Implement TCG_TARGET_HAS_rotv_vec Implement via expansion, so don't actually set TCG_TARGET_HAS_rotv_vec. Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index d0af654c65..f4c9cb8f9f 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -2970,6 +2970,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_shrv_vec: case INDEX_op_sarv_vec: case INDEX_op_rotli_vec: + case INDEX_op_rotlv_vec: + case INDEX_op_rotrv_vec: return -1; default: return 0; @@ -2980,7 +2982,7 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) { va_list va; - TCGv_vec v0, v1, v2, t1; + TCGv_vec v0, v1, v2, t1, t2, c1; TCGArg a2; va_start(va, a0); @@ -3025,6 +3027,37 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, tcg_temp_free_vec(t1); break; + case INDEX_op_rotlv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_sub_vec(vece, t1, v2, c1); + /* Right shifts are negative left shifts for NEON. */ + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); + tcg_gen_or_vec(vece, v0, v0, t1); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotrv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_neg_vec(vece, t1, v2); + tcg_gen_sub_vec(vece, t2, c1, v2); + /* Right shifts are negative left shifts for NEON. */ + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t2), + tcgv_vec_arg(v1), tcgv_vec_arg(t2)); + tcg_gen_or_vec(vece, v0, t1, t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + default: g_assert_not_reached(); }