target/arm: Convert Neon 'load single structure to all lanes' to decodetree

Convert the Neon "load single structure to all lanes" insns to decodetree.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200430181003.21682-13-peter.maydell@linaro.org
2020-04-30 19:09:39 +01:00 · 2020-04-30 19:09:39 +01:00 · 3698747c48
parent a27b463043
commit 3698747c48
3 changed files with 80 additions and 53 deletions
--- a/target/arm/neon-ls.decode
+++ b/target/arm/neon-ls.decode
@ -34,3 +34,8 @@
 VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \
               vd=%vd_dp
 # Neon load single element to all lanes
 #
 # Handled by trans_VLD_all_lanes(), which reads the fields as follows:
 #   rn   - register supplying the base address of the load
 #   n    - number of elements loaded, minus one (nregs = n + 1)
 #   size - log2 of the element size in bytes; size == 3 is only accepted
 #          for n == 3 with a == 1, where it is treated as size 2
 #   t    - n == 0: selects whether one or two D registers are written;
 #          otherwise selects a register stride of 1 or 2
 #   a    - alignment bit; some n/size combinations with a == 1 UNDEF
 #   rm   - passed, with rn, to the base register update after the load
 #   vd   - first destination D register (%vd_dp is defined elsewhere
 #          in this file)
 VLD_all_lanes  1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \
               vd=%vd_dp
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@ -398,3 +398,76 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
 }
 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
 {
    /*
     * Neon load single structure to all lanes.
     *
     * Loads (n + 1) consecutive elements starting at the address held
     * in Rn and replicates each one across every lane of its
     * destination D register(s), then performs the base register
     * update.
     *
     * Returns false for the encodings rejected below (UNDEF cases),
     * and true once the insn has been dealt with here -- including
     * the case where vfp_access_check() fails.
     */
    const int num_regs = a->n + 1;
    const int align_bit = a->a;
    int log2_esz = a->size;
    int dreg = a->vd;
    int reg_step, dup_bytes, elt_bytes, i;
    TCGv_i32 ptr, val;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if ((a->vd & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    if (log2_esz == 3) {
        /*
         * size == 3 is only valid for VLD4 with a == 1, where it
         * means 32 bits at 16 byte alignment.
         */
        if (num_regs != 4 || align_bit == 0) {
            return false;
        }
        log2_esz = 2;
    }

    /* Remaining invalid alignment-bit combinations */
    switch (num_regs) {
    case 1:
        if (align_bit == 1 && log2_esz == 0) {
            return false;
        }
        break;
    case 3:
        if (align_bit == 1) {
            return false;
        }
        break;
    default:
        break;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    reg_step = a->t ? 2 : 1;
    dup_bytes = (num_regs == 1) ? reg_step * 8 : 8;
    elt_bytes = 1 << log2_esz;

    val = tcg_temp_new_i32();
    ptr = tcg_temp_new_i32();
    load_reg_var(s, ptr, a->rn);

    for (i = 0; i < num_regs; i++, dreg += reg_step) {
        gen_aa32_ld_i32(s, val, ptr, get_mem_index(s),
                        s->be_data | log2_esz);

        if (dup_bytes != 16 || !(dreg & 1)) {
            /* Destination can be filled with a single dup */
            tcg_gen_gvec_dup_i32(log2_esz, neon_reg_offset(dreg, 0),
                                 dup_bytes, dup_bytes, val);
        } else {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned: fill the first Dreg, then
             * copy it into the following one.
             */
            tcg_gen_gvec_dup_i32(log2_esz, neon_reg_offset(dreg, 0),
                                 8, 8, val);
            tcg_gen_gvec_mov(0, neon_reg_offset(dreg + 1, 0),
                             neon_reg_offset(dreg, 0), 8, 8);
        }

        /* Step to the next element in memory */
        tcg_gen_addi_i32(ptr, ptr, elt_bytes);
    }

    tcg_temp_free_i32(val);
    tcg_temp_free_i32(ptr);

    /* Total bytes transferred: one element per register */
    gen_neon_ldst_base_update(s, a->rm, a->rn, elt_bytes * num_regs);
    return true;
 }
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@ -3224,7 +3224,6 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
    int size;
    int reg;
    int load;
    int vec_size;
    TCGv_i32 addr;
    TCGv_i32 tmp;
@ -3254,58 +3253,8 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
    } else {
        size = (insn >> 10) & 3;
        if (size == 3) {
-            /* Load single element to all lanes.  */
+            /* Load single element to all lanes -- handled by decodetree  */
-            int a = (insn >> 4) & 1;
+            return 1;
            if (!load) {
                return 1;
            }
            size = (insn >> 6) & 3;
            nregs = ((insn >> 8) & 3) + 1;
            if (size == 3) {
                if (nregs != 4 || a == 0) {
                    return 1;
                }
                /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */
                size = 2;
            }
            if (nregs == 1 && a == 1 && size == 0) {
                return 1;
            }
            if (nregs == 3 && a == 1) {
                return 1;
            }
            addr = tcg_temp_new_i32();
            load_reg_var(s, addr, rn);
            /* VLD1 to all lanes: bit 5 indicates how many Dregs to write.
             * VLD2/3/4 to all lanes: bit 5 indicates register stride.
             */
            stride = (insn & (1 << 5)) ? 2 : 1;
            vec_size = nregs == 1 ? stride * 8 : 8;
            tmp = tcg_temp_new_i32();
            for (reg = 0; reg < nregs; reg++) {
                gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                                s->be_data | size);
                if ((rd & 1) && vec_size == 16) {
                    /* We cannot write 16 bytes at once because the
                     * destination is unaligned.
                     */
                    tcg_gen_gvec_dup_i32(size, neon_reg_offset(rd, 0),
                                         8, 8, tmp);
                    tcg_gen_gvec_mov(0, neon_reg_offset(rd + 1, 0),
                                     neon_reg_offset(rd, 0), 8, 8);
                } else {
                    tcg_gen_gvec_dup_i32(size, neon_reg_offset(rd, 0),
                                         vec_size, vec_size, tmp);
                }
                tcg_gen_addi_i32(addr, addr, 1 << size);
                rd += stride;
            }
            tcg_temp_free_i32(tmp);
            tcg_temp_free_i32(addr);
            stride = (1 << size) * nregs;
        } else {
            /* Single element.  */
            int idx = (insn >> 4) & 0xf;