Hexagon (target/hexagon) Short-circuit more HVX single instruction packets

The generated helpers for HVX use pass-by-reference, so they can't short-circuit when the reads/writes overlap. The instructions with overrides are OK because they use tcg_gen_gvec_*.

We add a flag has_hvx_helper to DisasContext and extend gen_analyze_funcs to set the flag when the instruction is an HVX instruction with a generated helper.

We add an override for V6_vcombine so that it can be short-circuited, along with a test case in tests/tcg/hexagon/hvx_misc.c.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230427230012.3800327-15-tsimpson@quicinc.com>
2023-04-27 16:00:05 -07:00 · 2023-04-27 16:00:05 -07:00 · d05d5eebc7
parent b85529854e
commit d05d5eebc7
5 changed files with 65 additions and 2 deletions
--- a/target/hexagon/gen_analyze_funcs.py
+++ b/target/hexagon/gen_analyze_funcs.py
@ -212,6 +212,11 @@ def gen_analyze_func(f, tag, regs, imms):
    if has_generated_helper and "A_SCALAR_LOAD" in hex_common.attribdict[tag]:
        f.write("    ctx->need_pkt_has_store_s1 = true;\n")

+    ## Mark HVX instructions with generated helpers
+    if (has_generated_helper and
+        "A_CVI" in hex_common.attribdict[tag]):
+        f.write("    ctx->has_hvx_helper = true;\n")
+
    f.write("}\n\n")


--- a/target/hexagon/gen_tcg_hvx.h
+++ b/target/hexagon/gen_tcg_hvx.h
@ -140,6 +140,29 @@ static inline void assert_vhist_tmp(DisasContext *ctx)
                         sizeof(MMVector), sizeof(MMVector)); \
    } while (0)

+/*
+ * Vector combine: Vdd gets Vv in its first vector and Vu in its second
+ *
+ * When Vdd starts at Vu, Vu is staged in vtmp before being overwritten
+ */
+#define fGEN_TCG_V6_vcombine(SHORTCODE) \
+    do { \
+        if (VddV_off != VuV_off) { \
+            tcg_gen_gvec_mov(MO_64, VddV_off, VvV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off + sizeof(MMVector), VuV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+        } else { \
+            intptr_t tmpoff = offsetof(CPUHexagonState, vtmp); \
+            tcg_gen_gvec_mov(MO_64, tmpoff, VuV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off, VvV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off + sizeof(MMVector), tmpoff, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+        } \
+    } while (0)
+
 /* Vector conditional move */
 #define fGEN_TCG_VEC_CMOV(PRED) \
    do { \
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@ -378,8 +378,20 @@ static bool need_commit(DisasContext *ctx)
        return true;
    }

-    if (pkt->num_insns == 1 && !pkt->pkt_has_hvx) {
-        return false;
+    if (pkt->num_insns == 1) {
+        if (pkt->pkt_has_hvx) {
+            /*
+             * The HVX instructions with generated helpers use
+             * pass-by-reference, so they need the read/write overlap
+             * check below.
+             * The HVX instructions with overrides are OK.
+             */
+            if (!ctx->has_hvx_helper) {
+                return false;
+            }
+        } else {
+            return false;
+        }
    }

    /* Check for overlap between register reads and writes */
@ -454,6 +466,7 @@ static void analyze_packet(DisasContext *ctx)
 {
    Packet *pkt = ctx->pkt;
    ctx->need_pkt_has_store_s1 = false;
+    ctx->has_hvx_helper = false;
    for (int i = 0; i < pkt->num_insns; i++) {
        Insn *insn = &pkt->insn[i];
        ctx->insn = insn;
--- a/target/hexagon/translate.h
+++ b/target/hexagon/translate.h
@ -68,6 +68,7 @@ typedef struct DisasContext {
    bool is_tight_loop;
    bool need_pkt_has_store_s1;
    bool short_circuit;
+    bool has_hvx_helper;
 } DisasContext;

 static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
--- a/tests/tcg/hexagon/hvx_misc.c
+++ b/tests/tcg/hexagon/hvx_misc.c
@ -454,6 +454,25 @@ static void test_load_cur_predicated(void)
    check_output_w(__LINE__, BUFSIZE);
 }

+static void test_vcombine(void)
+{
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm volatile("v2 = vsplat(%0)\n\t"
+                     "v3 = vsplat(%1)\n\t"
+                     "v3:2 = vcombine(v2, v3)\n\t" /* dst overlaps srcs */
+                     "vmem(%2+#0) = v2\n\t"
+                     "vmem(%2+#1) = v3\n\t"
+                     :
+                     : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
+                     : "v2", "v3", "memory");
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[2 * i].w[j] = 2 * i + 1;    /* stored v2: old v3 */
+            expect[2 * i + 1].w[j] = 2 * i;    /* stored v3: old v2 */
+        }
+    }
+    check_output_w(__LINE__, BUFSIZE);
+}
+
 int main()
 {
    init_buffers();
@ -494,6 +513,8 @@ int main()
    test_load_tmp_predicated();
    test_load_cur_predicated();

+    test_vcombine();
+
    puts(err ? "FAIL" : "PASS");
    return err ? 1 : 0;
 }