From 58ff2981667262f77d57219fc9cef2a43a740159 Mon Sep 17 00:00:00 2001
From: Michael Lambert <mlambert@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:45 -0800
Subject: [PATCH 01/12] Hexagon (target/hexagon) fix bug in circular addressing

Versions V3 and earlier should treat the "K_const" and "length" values
as unsigned.

Modified circ_test_v3() in tests/tcg/hexagon/circ.c to reproduce the bug

Signed-off-by: Michael Lambert <mlambert@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-2-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 6 +++---
 tests/tcg/hexagon/circ.c   | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 057baf9a48..47bd51e0ca 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -304,8 +304,8 @@ void HELPER(debug_commit_end)(CPUHexagonState *env, int has_st0, int has_st1)
 
 int32_t HELPER(fcircadd)(int32_t RxV, int32_t offset, int32_t M, int32_t CS)
 {
-    int32_t K_const = sextract32(M, 24, 4);
-    int32_t length = sextract32(M, 0, 17);
+    uint32_t K_const = extract32(M, 24, 4);
+    uint32_t length = extract32(M, 0, 17);
     uint32_t new_ptr = RxV + offset;
     uint32_t start_addr;
     uint32_t end_addr;
diff --git a/tests/tcg/hexagon/circ.c b/tests/tcg/hexagon/circ.c
index 67a1aa3054..354416eb6d 100644
--- a/tests/tcg/hexagon/circ.c
+++ b/tests/tcg/hexagon/circ.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -415,7 +415,8 @@ static void circ_test_v3(void)
 {
     int *p = wbuf;
     int size = 15;
-    int K = 4;      /* 64 bytes */
+    /* set high bit in K to test unsigned extract in fcirc */
+    int K = 8;      /* 1024 bytes */
     int element;
     int i;
 

From 5b0043c67ccd7b88e0858204e79b09448adf4b34 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:46 -0800
Subject: [PATCH 02/12] Hexagon HVX (target/hexagon) fix bug in HVX saturate
 instructions

Two tests added to tests/tcg/hexagon/hvx_misc.c
    v21.uw = vadd(v11.uw, v10.uw):sat
    v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-3-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/macros.h      |  4 +-
 tests/tcg/hexagon/hvx_misc.c | 71 +++++++++++++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
index 19d103cad5..a78e84faa4 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -268,7 +268,7 @@ static inline void gen_pred_cancel(TCGv pred, int slot_num)
 
 #define fVSATUVALN(N, VAL) \
     ({ \
-        (((int)(VAL)) < 0) ? 0 : ((1LL << (N)) - 1); \
+        (((int64_t)(VAL)) < 0) ? 0 : ((1LL << (N)) - 1); \
     })
 #define fSATUVALN(N, VAL) \
     ({ \
diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
index 312bb98b41..b896f5897e 100644
--- a/tests/tcg/hexagon/hvx_misc.c
+++ b/tests/tcg/hexagon/hvx_misc.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -19,6 +19,7 @@
 #include <stdint.h>
 #include <stdbool.h>
 #include <string.h>
+#include <limits.h>
 
 int err;
 
@@ -432,6 +433,71 @@ TEST_PRED_OP2(pred_and, and, &, "")
 TEST_PRED_OP2(pred_and_n, and, &, "!")
 TEST_PRED_OP2(pred_xor, xor, ^, "")
 
+static void test_vadduwsat(void)
+{
+    /*
+     * Test for saturation by adding two numbers that add to more than UINT_MAX
+     * and make sure the result saturates to UINT_MAX
+     */
+    const uint32_t x = 0xffff0000;
+    const uint32_t y = 0x000fffff;
+
+    memset(expect, 0x12, sizeof(MMVector));
+    memset(output, 0x34, sizeof(MMVector));
+
+    asm volatile ("v10 = vsplat(%0)\n\t"
+                  "v11 = vsplat(%1)\n\t"
+                  "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
+                  "vmem(%2+#0) = v21\n\t"
+                  : /* no outputs */
+                  : "r"(x), "r"(y), "r"(output)
+                  : "v10", "v11", "v21", "memory");
+
+    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+        expect[0].uw[j] = UINT_MAX;
+    }
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_vsubuwsat_dv(void)
+{
+    /*
+     * Test for saturation by subtracting two numbers where the result is
+     * negative and make sure the result saturates to zero
+     *
+     * vsubuwsat_dv operates on an HVX register pair, so we'll have a
+     * pair of subtractions
+     *     w - x < 0
+     *     y - z < 0
+     */
+    const uint32_t w = 0x000000b7;
+    const uint32_t x = 0xffffff4e;
+    const uint32_t y = 0x31fe88e7;
+    const uint32_t z = 0x7fffff79;
+
+    memset(expect, 0x12, sizeof(MMVector) * 2);
+    memset(output, 0x34, sizeof(MMVector) * 2);
+
+    asm volatile ("v16 = vsplat(%0)\n\t"
+                  "v17 = vsplat(%1)\n\t"
+                  "v26 = vsplat(%2)\n\t"
+                  "v27 = vsplat(%3)\n\t"
+                  "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
+                  "vmem(%4+#0) = v24\n\t"
+                  "vmem(%4+#1) = v25\n\t"
+                  : /* no outputs */
+                  : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
+                  : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
+
+    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+        expect[0].uw[j] = 0x00000000;
+        expect[1].uw[j] = 0x00000000;
+    }
+
+    check_output_w(__LINE__, 2);
+}
+
 int main()
 {
     init_buffers();
@@ -464,6 +530,9 @@ int main()
     test_pred_and_n(true);
     test_pred_xor(false);
 
+    test_vadduwsat();
+    test_vsubuwsat_dv();
+
     puts(err ? "FAIL" : "PASS");
     return err ? 1 : 0;
 }

From 9a65990326cd59f28323714d72073515091383c9 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:47 -0800
Subject: [PATCH 03/12] Hexagon (target/hexagon) properly set FPINVF bit in
 sfcmp.uo and dfcmp.uo

Instead of checking for NaN arguments, use float??_unordered_quiet.

Test cases are added in a subsequent patch to more extensively test USR bits.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-4-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 47bd51e0ca..75dc0f23f0 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -938,8 +938,7 @@ int32_t HELPER(sfcmpuo)(CPUHexagonState *env, float32 RsV, float32 RtV)
 {
     int32_t PdV;
     arch_fpop_start(env);
-    PdV = f8BITSOF(float32_is_any_nan(RsV) ||
-                   float32_is_any_nan(RtV));
+    PdV = f8BITSOF(float32_unordered_quiet(RsV, RtV, &env->fp_status));
     arch_fpop_end(env);
     return PdV;
 }
@@ -1097,8 +1096,7 @@ int32_t HELPER(dfcmpuo)(CPUHexagonState *env, float64 RssV, float64 RttV)
 {
     int32_t PdV;
     arch_fpop_start(env);
-    PdV = f8BITSOF(float64_is_any_nan(RssV) ||
-                   float64_is_any_nan(RttV));
+    PdV = f8BITSOF(float64_unordered_quiet(RssV, RttV, &env->fp_status));
     arch_fpop_end(env);
     return PdV;
 }

From 77ccf44453a83e17cc830df700cc072f6bcf6a71 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:49 -0800
Subject: [PATCH 04/12] Hexagon (target/hexagon) properly handle denorm in
 arch_sf_recip_common

The arch_sf_recip_common function was calling float32_getexp, which
adjusts for denorm, but we actually need the raw exponent bits.

This function is called from 3 instructions
    sfrecipa
    sffixupn
    sffixupd

Test cases added to tests/tcg/hexagon/fpstuff.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-6-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/arch.c       |  6 ++---
 target/hexagon/fma_emu.h    |  6 ++++-
 tests/tcg/hexagon/fpstuff.c | 44 ++++++++++++++++++++++++++++++++++---
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c
index 68a55b3bd4..da79b41c4d 100644
--- a/target/hexagon/arch.c
+++ b/target/hexagon/arch.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -298,8 +298,8 @@ int arch_sf_recip_common(float32 *Rs, float32 *Rt, float32 *Rd, int *adjust,
     } else {
         PeV = 0x00;
         /* Basic checks passed */
-        n_exp = float32_getexp(RsV);
-        d_exp = float32_getexp(RtV);
+        n_exp = float32_getexp_raw(RsV);
+        d_exp = float32_getexp_raw(RtV);
         if ((n_exp - d_exp + SF_BIAS) <= SF_MANTBITS) {
             /* Near quotient underflow / inexact Q */
             PeV = 0x80;
diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
index e3b99a8cf4..91591d6050 100644
--- a/target/hexagon/fma_emu.h
+++ b/target/hexagon/fma_emu.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -24,6 +24,10 @@ static inline bool is_finite(float64 x)
 }
 
 int32_t float64_getexp(float64 f64);
+static inline uint32_t float32_getexp_raw(float32 f32)
+{
+    return extract32(f32, 23, 8);
+}
 int32_t float32_getexp(float32 f32);
 float32 infinite_float32(uint8_t sign);
 float32 internal_fmafx(float32 a, float32 b, float32 c,
diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 0dff429f4c..043f18fab3 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2020-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2020-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -38,6 +38,8 @@ const int SF_NaN_special =                0x7f800001;
 const int SF_ANY =                        0x3f800000;
 const int SF_HEX_NAN =                    0xffffffff;
 const int SF_small_neg =                  0xab98fba8;
+const int SF_denorm =                     0x00000001;
+const int SF_random =                     0x346001d6;
 
 const long long DF_NaN =                  0x7ff8000000000000ULL;
 const long long DF_ANY =                  0x3f80000000000000ULL;
@@ -250,10 +252,11 @@ static void check_dfminmax(void)
     check_fpstatus(usr, FPINVF);
 }
 
-static void check_recip_exception(void)
+static void check_sfrecipa(void)
 {
     int result;
     int usr;
+    int pred;
 
     /*
      * Check that sfrecipa doesn't set status bits when
@@ -329,6 +332,17 @@ static void check_recip_exception(void)
          : "r2", "p0", "usr");
     check32(result, 0x3f800000);
     check_fpstatus(usr, 0);
+
+    /*
+     * Check that sfrecipa properly handles denorm
+     */
+    asm (CLEAR_FPSTATUS
+         "%0,p0 = sfrecipa(%2, %3)\n\t"
+         "%1 = p0\n\t"
+         : "=r"(result), "=r"(pred) : "r"(SF_denorm), "r"(SF_random)
+         : "r2", "p0", "usr");
+    check32(result, 0x6a920001);
+    check32(pred, 0x80);
 }
 
 static void check_canonical_NaN(void)
@@ -455,6 +469,28 @@ static void check_invsqrta(void)
     check32(predval, 0x0);
 }
 
+static void check_sffixupn(void)
+{
+    int result;
+
+    /* Check that sffixupn properly deals with denorm */
+    asm volatile("%0 = sffixupn(%1, %2)\n\t"
+                 : "=r"(result)
+                 : "r"(SF_random), "r"(SF_denorm));
+    check32(result, 0x246001d6);
+}
+
+static void check_sffixupd(void)
+{
+    int result;
+
+    /* Check that sffixupd properly deals with denorm */
+    asm volatile("%0 = sffixupd(%1, %2)\n\t"
+                 : "=r"(result)
+                 : "r"(SF_denorm), "r"(SF_random));
+    check32(result, 0x146001d6);
+}
+
 static void check_float2int_convs()
 {
     int res32;
@@ -602,9 +638,11 @@ int main()
     check_compare_exception();
     check_sfminmax();
     check_dfminmax();
-    check_recip_exception();
+    check_sfrecipa();
     check_canonical_NaN();
     check_invsqrta();
+    check_sffixupn();
+    check_sffixupd();
     check_float2int_convs();
 
     puts(err ? "FAIL" : "PASS");

From d76dd816bf328a66ce57b2fb27d046656d3ab411 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Tue, 15 Feb 2022 20:39:39 -0800
Subject: [PATCH 05/12] Hexagon (target/hexagon) properly handle NaN in
 dfmin/dfmax/sfmin/sfmax

The float??_minnum/float??_maxnum implementations differ from Hexagon
for SNaN: they return NaN, but Hexagon returns the other input.  So, we
use float??_minimum_number/float??_maximum_number instead.

Test cases added to tests/tcg/hexagon/fpstuff.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220308190410.22355-1-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c  | 14 ++-----
 tests/tcg/hexagon/fpstuff.c | 79 +++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 75dc0f23f0..366caf9ec8 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -947,7 +947,7 @@ float32 HELPER(sfmax)(CPUHexagonState *env, float32 RsV, float32 RtV)
 {
     float32 RdV;
     arch_fpop_start(env);
-    RdV = float32_maxnum(RsV, RtV, &env->fp_status);
+    RdV = float32_maximum_number(RsV, RtV, &env->fp_status);
     arch_fpop_end(env);
     return RdV;
 }
@@ -956,7 +956,7 @@ float32 HELPER(sfmin)(CPUHexagonState *env, float32 RsV, float32 RtV)
 {
     float32 RdV;
     arch_fpop_start(env);
-    RdV = float32_minnum(RsV, RtV, &env->fp_status);
+    RdV = float32_minimum_number(RsV, RtV, &env->fp_status);
     arch_fpop_end(env);
     return RdV;
 }
@@ -1040,10 +1040,7 @@ float64 HELPER(dfmax)(CPUHexagonState *env, float64 RssV, float64 RttV)
 {
     float64 RddV;
     arch_fpop_start(env);
-    RddV = float64_maxnum(RssV, RttV, &env->fp_status);
-    if (float64_is_any_nan(RssV) || float64_is_any_nan(RttV)) {
-        float_raise(float_flag_invalid, &env->fp_status);
-    }
+    RddV = float64_maximum_number(RssV, RttV, &env->fp_status);
     arch_fpop_end(env);
     return RddV;
 }
@@ -1052,10 +1049,7 @@ float64 HELPER(dfmin)(CPUHexagonState *env, float64 RssV, float64 RttV)
 {
     float64 RddV;
     arch_fpop_start(env);
-    RddV = float64_minnum(RssV, RttV, &env->fp_status);
-    if (float64_is_any_nan(RssV) || float64_is_any_nan(RttV)) {
-        float_raise(float_flag_invalid, &env->fp_status);
-    }
+    RddV = float64_minimum_number(RssV, RttV, &env->fp_status);
     arch_fpop_end(env);
     return RddV;
 }
diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 043f18fab3..56bf562a40 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -41,7 +41,8 @@ const int SF_small_neg =                  0xab98fba8;
 const int SF_denorm =                     0x00000001;
 const int SF_random =                     0x346001d6;
 
-const long long DF_NaN =                  0x7ff8000000000000ULL;
+const long long DF_QNaN =                 0x7ff8000000000000ULL;
+const long long DF_SNaN =                 0x7ff7000000000000ULL;
 const long long DF_ANY =                  0x3f80000000000000ULL;
 const long long DF_HEX_NAN =              0xffffffffffffffffULL;
 const long long DF_small_neg =            0xbd731f7500000000ULL;
@@ -128,7 +129,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.eq(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -137,7 +138,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.gt(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -146,7 +147,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.ge(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -208,7 +209,7 @@ static void check_dfminmax(void)
     int usr;
 
     /*
-     * Execute dfmin/dfmax instructions with one operand as NaN
+     * Execute dfmin/dfmax instructions with one operand as SNaN
      * Check that
      *     Result is the other operand
      *     Invalid bit in USR is set
@@ -216,7 +217,7 @@ static void check_dfminmax(void)
      asm (CLEAR_FPSTATUS
          "%0 = dfmin(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_ANY)
          : "r2", "usr");
     check64(minmax, DF_ANY);
     check_fpstatus(usr, FPINVF);
@@ -224,13 +225,35 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmax(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_ANY)
          : "r2", "usr");
     check64(minmax, DF_ANY);
     check_fpstatus(usr, FPINVF);
 
     /*
-     * Execute dfmin/dfmax instructions with both operands NaN
+     * Execute dfmin/dfmax instructions with one operand as QNaN
+     * Check that
+     *     Result is the other operand
+     *     No bit in USR is set
+     */
+     asm (CLEAR_FPSTATUS
+         "%0 = dfmin(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
+         : "r2", "usr");
+    check64(minmax, DF_ANY);
+    check_fpstatus(usr, 0);
+
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmax(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
+         : "r2", "usr");
+    check64(minmax, DF_ANY);
+    check_fpstatus(usr, 0);
+
+    /*
+     * Execute dfmin/dfmax instructions with both operands SNaN
      * Check that
      *     Result is DF_HEX_NAN
      *     Invalid bit in USR is set
@@ -238,7 +261,7 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmin(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_NaN)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_SNaN)
          : "r2", "usr");
     check64(minmax, DF_HEX_NAN);
     check_fpstatus(usr, FPINVF);
@@ -246,10 +269,32 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmax(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_NaN)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_SNaN)
          : "r2", "usr");
     check64(minmax, DF_HEX_NAN);
     check_fpstatus(usr, FPINVF);
+
+    /*
+     * Execute dfmin/dfmax instructions with both operands QNaN
+     * Check that
+     *     Result is DF_HEX_NAN
+     *     No bit in USR is set
+     */
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmin(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_QNaN)
+         : "r2", "usr");
+    check64(minmax, DF_HEX_NAN);
+    check_fpstatus(usr, 0);
+
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmax(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_QNaN)
+         : "r2", "usr");
+    check64(minmax, DF_HEX_NAN);
+    check_fpstatus(usr, 0);
 }
 
 static void check_sfrecipa(void)
@@ -425,7 +470,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2sf(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(sf_result), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(sf_result), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(sf_result, SF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -433,7 +478,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = dfadd(%2, %3)\n\t"
         "%1 = usr\n\t"
-        : "=r"(df_result), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+        : "=r"(df_result), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
         : "r2", "usr");
     check64(df_result, DF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -441,7 +486,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = dfsub(%2, %3)\n\t"
         "%1 = usr\n\t"
-        : "=r"(df_result), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+        : "=r"(df_result), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
         : "r2", "usr");
     check64(df_result, DF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -603,7 +648,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2w(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(res32), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res32), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(res32, -1);
     check_fpstatus(usr, FPINVF);
@@ -611,7 +656,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2w(%2):chop\n\t"
         "%1 = usr\n\t"
-        : "=r"(res32), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res32), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(res32, -1);
     check_fpstatus(usr, FPINVF);
@@ -619,7 +664,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2d(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(res64), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res64), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check64(res64, -1);
     check_fpstatus(usr, FPINVF);
@@ -627,7 +672,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2d(%2):chop\n\t"
         "%1 = usr\n\t"
-        : "=r"(res64), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res64), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check64(res64, -1);
     check_fpstatus(usr, FPINVF);

From 4d04395a1716c669cf634a90e768c1baa0e68aff Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:50 -0800
Subject: [PATCH 06/12] Hexagon (tests/tcg/hexagon) test instructions that
 might set bits in USR

Hexagon has ~200 instructions that set the saturate bit in USR.  These
were broken into groups of similar instructions, and one instruction
from each group is tested with at least one input that does not
saturate and at least one input that does saturate.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-7-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/Makefile.target |   8 +-
 tests/tcg/hexagon/usr.c           | 798 ++++++++++++++++++++++++++++++
 2 files changed, 805 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/hexagon/usr.c

diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index 8b07a28166..23b9870534 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -1,5 +1,5 @@
 ##
-##  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -30,6 +30,7 @@ first: $(HEX_SRC)/first.S
 HEX_TESTS = first
 HEX_TESTS += hex_sigsegv
 HEX_TESTS += misc
+HEX_TESTS += usr
 HEX_TESTS += preg_alias
 HEX_TESTS += dual_stores
 HEX_TESTS += multi_result
@@ -43,3 +44,8 @@ HEX_TESTS += fpstuff
 HEX_TESTS += overflow
 
 TESTS += $(HEX_TESTS)
+
+# This test has to be compiled for the -mv67t target
+usr: usr.c
+	$(CC) $(CFLAGS) -mv67t -O2 -Wno-inline-asm -Wno-expansion-to-defined $< -o $@ $(LDFLAGS)
+
diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
new file mode 100644
index 0000000000..e82727237e
--- /dev/null
+++ b/tests/tcg/hexagon/usr.c
@@ -0,0 +1,798 @@
+/*
+ *  Copyright(c) 2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Test instructions that might set bits in user status register (USR)
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+int err;
+
+static void __check(int line, uint32_t val, uint32_t expect)
+{
+    if (val != expect) {
+        printf("ERROR at line %d: %u != %u\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check(RES, EXP) __check(__LINE__, RES, EXP)
+
+static void __check32(int line, uint32_t val, uint32_t expect)
+{
+    if (val != expect) {
+        printf("ERROR at line %d: 0x%08x != 0x%08x\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check32(RES, EXP) __check32(__LINE__, RES, EXP)
+
+static void __check64(int line, uint64_t val, uint64_t expect)
+{
+    if (val != expect) {
+        printf("ERROR at line %d: 0x%016llx != 0x%016llx\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check64(RES, EXP) __check64(__LINE__, RES, EXP)
+
+/*
+ * Some of the instructions tested are only available on certain versions
+ * of the Hexagon core
+ */
+#define CORE_HAS_AUDIO    (__HEXAGON_ARCH__ >= 67 && defined(__HEXAGON_AUDIO__))
+#define CORE_IS_V67       (__HEXAGON_ARCH__ >= 67)
+
+/* Define the bits in Hexagon USR register */
+#define USR_OVF_BIT          0        /* Sticky saturation overflow */
+#define USR_FPINVF_BIT       1        /* IEEE FP invalid sticky flag */
+#define USR_FPDBZF_BIT       2        /* IEEE FP divide-by-zero sticky flag */
+#define USR_FPOVFF_BIT       3        /* IEEE FP overflow sticky flag */
+#define USR_FPUNFF_BIT       4        /* IEEE FP underflow sticky flag */
+#define USR_FPINPF_BIT       5        /* IEEE FP inexact sticky flag */
+
+/* Corresponding values in USR */
+#define USR_CLEAR            0
+#define USR_OVF              (1 << USR_OVF_BIT)
+#define USR_FPINVF           (1 << USR_FPINVF_BIT)
+#define USR_FPDBZF           (1 << USR_FPDBZF_BIT)
+#define USR_FPOVFF           (1 << USR_FPOVFF_BIT)
+#define USR_FPUNFF           (1 << USR_FPUNFF_BIT)
+#define USR_FPINPF           (1 << USR_FPINPF_BIT)
+
+/*
+ * Templates for functions to execute an instruction
+ *
+ * The templates vary by the number of arguments and the types of the args
+ * and result.  We use one letter in the macro name for the result and each
+ * argument:
+ *     x             unknown (specified in a subsequent template) or don't care
+ *     R             register (32 bits)
+ *     P             pair (64 bits)
+ *     p             predicate
+ *     I             immediate
+ *     Xx            read/write
+ */
+
+/* Clear bits 0-5 in USR */
+#define CLEAR_USRBITS \
+    "r2 = usr\n\t" \
+    "r2 = and(r2, #0xffffffc0)\n\t" \
+    "usr = r2\n\t"
+
+/* Template for instructions with one register operand */
+#define FUNC_x_OP_x(RESTYPE, SRCTYPE, NAME, INSN) \
+static RESTYPE NAME(SRCTYPE src, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_R_OP_R(NAME, INSN) \
+FUNC_x_OP_x(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_P(NAME, INSN) \
+FUNC_x_OP_x(uint32_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_P(NAME, INSN) \
+FUNC_x_OP_x(uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_R(NAME, INSN) \
+FUNC_x_OP_x(uint64_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a register and predicate result
+ * and one register operand
+ */
+#define FUNC_xp_OP_x(RESTYPE, SRCTYPE, NAME, INSN) \
+static RESTYPE NAME(SRCTYPE src, uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint8_t pred; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "=r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_Rp_OP_R(NAME, INSN) \
+FUNC_xp_OP_x(uint32_t, uint32_t, NAME, INSN)
+
+/* Template for instructions with two register operands */
+#define FUNC_x_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, SRC2TYPE src2, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_P_OP_PP(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_R_OP_PP(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_RR(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_RR(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_PR(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint64_t, uint32_t, NAME, INSN)
+
+#define FUNC_P_OP_PR(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint64_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a register and predicate result
+ * and two register operands
+ */
+#define FUNC_xp_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, SRC2TYPE src2, \
+                    uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint8_t pred; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "=r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_Rp_OP_RR(NAME, INSN) \
+FUNC_xp_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+/* Template for instructions with one register and one immediate */
+#define FUNC_x_OP_xI(RESTYPE, SRC1TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, int32_t src2, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "i"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_R_OP_RI(NAME, INSN) \
+FUNC_x_OP_xI(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_PI(NAME, INSN) \
+FUNC_x_OP_xI(uint32_t, uint64_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result
+ * and two register operands
+ */
+#define FUNC_Xx_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, \
+                    uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "+r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XR_OP_RR(NAME, INSN) \
+FUNC_Xx_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_XP_OP_PP(NAME, INSN) \
+FUNC_Xx_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_XP_OP_RR(NAME, INSN) \
+FUNC_Xx_OP_xx(uint64_t, uint32_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result, a predicate
+ * result (read back from p2), and two register operands
+ */
+#define FUNC_Xxp_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, \
+                    uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    uint8_t pred; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "+r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XPp_OP_PP(NAME, INSN) \
+FUNC_Xxp_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result and
+ * two register and one predicate operands
+ */
+#define FUNC_Xx_OP_xxp(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, uint8_t pred,\
+                    uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        "p2 = %4\n\t" \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "+r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2), "r"(pred) \
+        : "r2", "p2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XR_OP_RRp(NAME, INSN) \
+FUNC_Xx_OP_xxp(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+/*
+ * Function declarations using the templates
+ */
+FUNC_R_OP_R(satub,              "%0 = satub(%2)")
+FUNC_P_OP_PP(vaddubs,           "%0 = vaddub(%2, %3):sat")
+FUNC_P_OP_PP(vadduhs,           "%0 = vadduh(%2, %3):sat")
+FUNC_P_OP_PP(vsububs,           "%0 = vsubub(%2, %3):sat")
+FUNC_P_OP_PP(vsubuhs,           "%0 = vsubuh(%2, %3):sat")
+
+/* Add vector of half integers with saturation and pack to unsigned bytes */
+FUNC_R_OP_PP(vaddhubs,          "%0 = vaddhub(%2, %3):sat")
+
+/* Vector saturate half to unsigned byte */
+FUNC_R_OP_P(vsathub,            "%0 = vsathub(%2)")
+
+/* Similar to above but takes a 32-bit argument */
+FUNC_R_OP_R(svsathub,           "%0 = vsathub(%2)")
+
+/* Vector saturate word to unsigned half */
+FUNC_P_OP_P(vsatwuh_nopack,     "%0 = vsatwuh(%2)")
+
+/* Similar to above but returns a 32-bit result */
+FUNC_R_OP_P(vsatwuh,            "%0 = vsatwuh(%2)")
+
+/* Vector arithmetic shift halfwords with saturate and pack */
+FUNC_R_OP_PI(asrhub_sat,        "%0 = vasrhub(%2, #%3):sat")
+
+/* Vector arithmetic shift halfwords with round, saturate and pack */
+FUNC_R_OP_PI(asrhub_rnd_sat,    "%0 = vasrhub(%2, #%3):raw")
+
+FUNC_R_OP_RR(addsat,            "%0 = add(%2, %3):sat")
+/* Similar to above but with register pairs */
+FUNC_P_OP_PP(addpsat,           "%0 = add(%2, %3):sat")
+
+FUNC_XR_OP_RR(mpy_acc_sat_hh_s0, "%0 += mpy(%2.H, %3.H):sat")
+FUNC_R_OP_RR(mpy_sat_hh_s1,     "%0 = mpy(%2.H, %3.H):<<1:sat")
+FUNC_R_OP_RR(mpy_sat_rnd_hh_s1, "%0 = mpy(%2.H, %3.H):<<1:rnd:sat")
+FUNC_R_OP_RR(mpy_up_s1_sat,     "%0 = mpy(%2, %3):<<1:sat")
+FUNC_P_OP_RR(vmpy2s_s1,         "%0 = vmpyh(%2, %3):<<1:sat")
+FUNC_P_OP_RR(vmpy2su_s1,        "%0 = vmpyhsu(%2, %3):<<1:sat")
+FUNC_R_OP_RR(vmpy2s_s1pack,     "%0 = vmpyh(%2, %3):<<1:rnd:sat")
+FUNC_P_OP_PP(vmpy2es_s1,        "%0 = vmpyeh(%2, %3):<<1:sat")
+FUNC_R_OP_PP(vdmpyrs_s1,        "%0 = vdmpy(%2, %3):<<1:rnd:sat")
+FUNC_XP_OP_PP(vdmacs_s0,        "%0 += vdmpy(%2, %3):sat")
+FUNC_R_OP_RR(cmpyrs_s0,         "%0 = cmpy(%2, %3):rnd:sat")
+FUNC_XP_OP_RR(cmacs_s0,         "%0 += cmpy(%2, %3):sat")
+FUNC_XP_OP_RR(cnacs_s0,         "%0 -= cmpy(%2, %3):sat")
+FUNC_P_OP_PP(vrcmpys_s1_h,      "%0 = vrcmpys(%2, %3):<<1:sat:raw:hi")
+FUNC_XP_OP_PP(mmacls_s0,        "%0 += vmpyweh(%2, %3):sat")
+FUNC_R_OP_RR(hmmpyl_rs1,        "%0 = mpy(%2, %3.L):<<1:rnd:sat")
+FUNC_XP_OP_PP(mmaculs_s0,       "%0 += vmpyweuh(%2, %3):sat")
+FUNC_R_OP_PR(cmpyi_wh,          "%0 = cmpyiwh(%2, %3):<<1:rnd:sat")
+FUNC_P_OP_PP(vcmpy_s0_sat_i,    "%0 = vcmpyi(%2, %3):sat")
+FUNC_P_OP_PR(vcrotate,          "%0 = vcrotate(%2, %3)")
+FUNC_P_OP_PR(vcnegh,            "%0 = vcnegh(%2, %3)")
+
+#if CORE_HAS_AUDIO
+FUNC_R_OP_PP(wcmpyrw,           "%0 = cmpyrw(%2, %3):<<1:sat")
+#endif
+
+FUNC_R_OP_RR(addh_l16_sat_ll,   "%0 = add(%2.L, %3.L):sat")
+FUNC_P_OP_P(vconj,              "%0 = vconj(%2):sat")
+FUNC_P_OP_PP(vxaddsubw,         "%0 = vxaddsubw(%2, %3):sat")
+FUNC_P_OP_P(vabshsat,           "%0 = vabsh(%2):sat")
+FUNC_P_OP_PP(vnavgwr,           "%0 = vnavgw(%2, %3):rnd:sat")
+FUNC_R_OP_RI(round_ri_sat,      "%0 = round(%2, #%3):sat")
+FUNC_R_OP_RR(asr_r_r_sat,       "%0 = asr(%2, %3):sat")
+
+FUNC_XPp_OP_PP(ACS,             "%0, p2 = vacsh(%3, %4)")
+
+/*
+ * Templates for test cases
+ *
+ * Same naming convention as the function templates
+ */
+#define TEST_x_OP_x(RESTYPE, CHECKFN, SRCTYPE, FUNC, SRC, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRCTYPE src = SRC; \
+        uint32_t usr_result; \
+        result = FUNC(src, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_R_OP_R(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint32_t, check32, uint32_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_R_OP_P(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint32_t, check32, uint64_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_P_OP_P(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint64_t, check64, uint64_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_P_OP_R(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint64_t, check64, uint32_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_xp_OP_x(RESTYPE, CHECKFN, SRCTYPE, FUNC, SRC, \
+                     RES, PRED_RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRCTYPE src = SRC; \
+        uint8_t pred_result; \
+        uint32_t usr_result; \
+        result = FUNC(src, &pred_result, &usr_result); \
+        CHECKFN(result, RES); \
+        check(pred_result, PRED_RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_Rp_OP_R(FUNC, SRC, RES, PRED_RES, USR_RES) \
+TEST_xp_OP_x(uint32_t, check32, uint32_t, FUNC, SRC, RES, PRED_RES, USR_RES)
+
+#define TEST_x_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                     FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_P_OP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint64_t, uint64_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint64_t, uint64_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_P_OP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint32_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint32_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint64_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_P_OP_PR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint64_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_xp_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, FUNC, SRC1, SRC2, \
+                      RES, PRED_RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred_result; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &pred_result, &usr_result); \
+        CHECKFN(result, RES); \
+        check(pred_result, PRED_RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_Rp_OP_RR(FUNC, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+TEST_xp_OP_xx(uint32_t, check32, uint32_t, uint32_t, FUNC, SRC1, SRC2, \
+              RES, PRED_RES, USR_RES)
+
+#define TEST_x_OP_xI(RESTYPE, CHECKFN, SRC1TYPE, \
+                     FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        uint32_t src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_R_OP_RI(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xI(uint32_t, check32, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PI(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xI(uint32_t, check32, uint64_t, /* 32-bit result */ \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_Xx_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                      FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result = RESIN; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(result, src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XR_OP_RR(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint32_t, check32, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_XP_OP_PP(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint64_t, check64, uint64_t, uint64_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_XP_OP_RR(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint64_t, check64, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_Xxp_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                       FUNC, RESIN, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+    do { \
+        RESTYPE result = RESIN; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred_res; \
+        uint32_t usr_result; \
+        result = FUNC(result, src1, src2, &pred_res, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XPp_OP_PP(FUNC, RESIN, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+TEST_Xxp_OP_xx(uint64_t, check64, uint64_t, uint64_t, FUNC, RESIN, SRC1, SRC2, \
+               RES, PRED_RES, USR_RES)
+
+#define TEST_Xx_OP_xxp(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                      FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES) \
+    do { \
+        RESTYPE result = RESIN; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred = PRED; \
+        uint32_t usr_result; \
+        result = FUNC(result, src1, src2, pred, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XR_OP_RRp(FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES) \
+TEST_Xx_OP_xxp(uint32_t, check32, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES)
+
+int main()
+{
+    TEST_R_OP_R(satub,       0,         0,         USR_CLEAR);
+    TEST_R_OP_R(satub,       0xff,      0xff,      USR_CLEAR);
+    TEST_R_OP_R(satub,       0xfff,     0xff,      USR_OVF);
+    TEST_R_OP_R(satub,       -1,        0,         USR_OVF);
+
+    TEST_P_OP_PP(vaddubs,    0xfeLL,    0x01LL,    0xffLL,    USR_CLEAR);
+    TEST_P_OP_PP(vaddubs,    0xffLL,    0xffLL,    0xffLL,    USR_OVF);
+
+    TEST_P_OP_PP(vadduhs,    0xfffeLL,  0x1LL,     0xffffLL,  USR_CLEAR);
+    TEST_P_OP_PP(vadduhs,    0xffffLL,  0x1LL,     0xffffLL,  USR_OVF);
+
+    TEST_P_OP_PP(vsububs, 0x0807060504030201LL, 0x0101010101010101LL,
+                 0x0706050403020100LL, USR_CLEAR);
+    TEST_P_OP_PP(vsububs, 0x0807060504030201LL, 0x0202020202020202LL,
+                 0x0605040302010000LL, USR_OVF);
+
+    TEST_P_OP_PP(vsubuhs, 0x0004000300020001LL, 0x0001000100010001LL,
+                 0x0003000200010000LL, USR_CLEAR);
+    TEST_P_OP_PP(vsubuhs, 0x0004000300020001LL, 0x0002000200020002LL,
+                 0x0002000100000000LL, USR_OVF);
+
+    TEST_R_OP_PP(vaddhubs, 0x0004000300020001LL, 0x0001000100010001LL,
+                 0x05040302, USR_CLEAR);
+    TEST_R_OP_PP(vaddhubs, 0x7fff000300020001LL, 0x0002000200020002LL,
+                 0xff050403, USR_OVF);
+
+    TEST_R_OP_P(vsathub,         0x0001000300020001LL, 0x01030201, USR_CLEAR);
+    TEST_R_OP_P(vsathub,         0x010000700080ffffLL, 0xff708000, USR_OVF);
+
+    TEST_R_OP_P(vsatwuh,         0x0000ffff00000001LL, 0xffff0001, USR_CLEAR);
+    TEST_R_OP_P(vsatwuh,         0x800000000000ffffLL, 0x0000ffff, USR_OVF);
+
+    TEST_P_OP_P(vsatwuh_nopack,  0x0000ffff00000001LL, 0x0000ffff00000001LL,
+                USR_CLEAR);
+    TEST_P_OP_P(vsatwuh_nopack,  0x800000000000ffffLL, 0x000000000000ffffLL,
+                USR_OVF);
+
+    TEST_R_OP_R(svsathub,        0x00020001,           0x0201,     USR_CLEAR);
+    TEST_R_OP_R(svsathub,        0x0080ffff,           0x8000,     USR_OVF);
+
+    TEST_R_OP_PI(asrhub_sat,     0x004f003f002f001fLL, 3,    0x09070503,
+                 USR_CLEAR);
+    TEST_R_OP_PI(asrhub_sat,     0x004fffff8fff001fLL, 3,    0x09000003,
+                 USR_OVF);
+
+    TEST_R_OP_PI(asrhub_rnd_sat, 0x004f003f002f001fLL, 2,    0x0a080604,
+                 USR_CLEAR);
+    TEST_R_OP_PI(asrhub_rnd_sat, 0x004fffff8fff001fLL, 2,    0x0a000004,
+                 USR_OVF);
+
+    TEST_R_OP_RR(addsat,        1,              2,              3,
+                 USR_CLEAR);
+    TEST_R_OP_RR(addsat,        0x7fffffff,     0x00000010,     0x7fffffff,
+                 USR_OVF);
+    TEST_R_OP_RR(addsat,        0x80000000,     0x80000006,     0x80000000,
+                 USR_OVF);
+
+    TEST_P_OP_PP(addpsat, 1LL, 2LL, 3LL, USR_CLEAR);
+    /* overflow to max positive */
+    TEST_P_OP_PP(addpsat, 0x7ffffffffffffff0LL, 0x0000000000000010LL,
+                 0x7fffffffffffffffLL, USR_OVF);
+    /* overflow to min negative */
+    TEST_P_OP_PP(addpsat, 0x8000000000000003LL, 0x8000000000000006LL,
+                 0x8000000000000000LL, USR_OVF);
+
+    TEST_XR_OP_RR(mpy_acc_sat_hh_s0, 0x7fffffff, 0xffff0000, 0x11110000,
+                  0x7fffeeee, USR_CLEAR);
+    TEST_XR_OP_RR(mpy_acc_sat_hh_s0, 0x7fffffff, 0x7fff0000, 0x7fff0000,
+                  0x7fffffff, USR_OVF);
+
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0xffff0000, 0x11110000, 0xffffddde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0x7fff0000, 0x7fff0000, 0x7ffe0002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0xffff0000, 0x11110000, 0x00005dde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0x7fff0000, 0x7fff0000, 0x7ffe8002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_R_OP_RR(mpy_up_s1_sat,        0xffff0000, 0x11110000, 0xffffddde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_up_s1_sat,        0x7fff0000, 0x7fff0000, 0x7ffe0002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_up_s1_sat,        0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_P_OP_RR(vmpy2s_s1,  0x7fff0000, 0x7fff0000, 0x7ffe000200000000LL,
+                 USR_CLEAR);
+    TEST_P_OP_RR(vmpy2s_s1,  0x80000000, 0x80000000, 0x7fffffff00000000LL,
+                 USR_OVF);
+
+    TEST_P_OP_RR(vmpy2su_s1, 0x7fff0000, 0x7fff0000, 0x7ffe000200000000LL,
+                 USR_CLEAR);
+    TEST_P_OP_RR(vmpy2su_s1, 0xffffbd97, 0xffffffff, 0xfffe000280000000LL,
+                 USR_OVF);
+
+    TEST_R_OP_RR(vmpy2s_s1pack,        0x7fff0000, 0x7fff0000, 0x7ffe0000,
+                 USR_CLEAR);
+    TEST_R_OP_RR(vmpy2s_s1pack,        0x80008000, 0x80008000, 0x7fff7fff,
+                 USR_OVF);
+
+    TEST_P_OP_PP(vmpy2es_s1, 0x7fff7fff7fff7fffLL, 0x1fff1fff1fff1fffLL,
+                 0x1ffec0021ffec002LL, USR_CLEAR);
+    TEST_P_OP_PP(vmpy2es_s1, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_R_OP_PP(vdmpyrs_s1, 0x7fff7fff7fff7fffLL, 0x1fff1fff1fff1fffLL,
+                 0x3ffe3ffe, USR_CLEAR);
+    TEST_R_OP_PP(vdmpyrs_s1, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fff7fffLL, USR_OVF);
+
+    TEST_XP_OP_PP(vdmacs_s0, 0x0fffffffULL, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff00ff00ffLL, 0x0001fc021001fc01LL, USR_CLEAR);
+    TEST_XP_OP_PP(vdmacs_s0, 0x01111111ULL, 0x8000800080001000LL,
+                  0x8000800080008000LL, 0x7fffffff39111111LL, USR_OVF);
+
+    TEST_R_OP_RR(cmpyrs_s0,            0x7fff0000, 0x7fff0000, 0x0000c001,
+                 USR_CLEAR);
+    TEST_R_OP_RR(cmpyrs_s0,            0x80008000, 0x80008000, 0x7fff0000,
+                 USR_OVF);
+
+    TEST_XP_OP_RR(cmacs_s0, 0x0fffffff, 0x7fff0000, 0x7fff0000,
+                  0x00000000d000fffeLL, USR_CLEAR);
+    TEST_XP_OP_RR(cmacs_s0, 0x0fff1111, 0x80008000, 0x80008000,
+                  0x7fffffff0fff1111LL, USR_OVF);
+
+    TEST_XP_OP_RR(cnacs_s0, 0x000000108fffffffULL, 0x7fff0000, 0x7fff0000,
+                  0x00000010cfff0000ULL, USR_CLEAR);
+    TEST_XP_OP_RR(cnacs_s0, 0x000000108ff1111fULL, 0x00002001, 0x00007ffd,
+                  0x0000001080000000ULL, USR_OVF);
+
+    TEST_P_OP_PP(vrcmpys_s1_h, 0x00ff00ff00ff00ffLL, 0x00ff00ff00ff00ffLL,
+                 0x0003f8040003f804LL, USR_CLEAR);
+    TEST_P_OP_PP(vrcmpys_s1_h, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_XP_OP_PP(mmacls_s0, 0x6fffffff, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff00ff00ffLL, 0x0000fe017000fe00LL, USR_CLEAR);
+    TEST_XP_OP_PP(mmacls_s0, 0x6f1111ff, 0x8000800080008000LL,
+                  0x1000100080008000LL, 0xf80008007fffffffLL, USR_OVF);
+
+    TEST_R_OP_RR(hmmpyl_rs1,           0x7fff0000, 0x7fff0001, 0x0000fffe,
+                 USR_CLEAR);
+    TEST_R_OP_RR(hmmpyl_rs1,           0x80000000, 0x80008000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_XP_OP_PP(mmaculs_s0, 0x000000007fffffffULL, 0xffff800080008000LL,
+                  0xffff800080008000LL, 0xffffc00040003fffLL, USR_CLEAR);
+    TEST_XP_OP_PP(mmaculs_s0, 0x000011107fffffffULL, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff001100ffLL, 0x00010f117fffffffLL, USR_OVF);
+
+    TEST_R_OP_PR(cmpyi_wh, 0x7fff000000000000LL, 0x7fff0001, 0x0000fffe,
+                 USR_CLEAR);
+    TEST_R_OP_PR(cmpyi_wh, 0x8000000000000000LL, 0x80008000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_P_OP_PP(vcmpy_s0_sat_i, 0x00ff00ff00ff00ffLL, 0x00ff00ff00ff00ffLL,
+                 0x0001fc020001fc02LL, USR_CLEAR);
+    TEST_P_OP_PP(vcmpy_s0_sat_i, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_P_OP_PR(vcrotate, 0x8000000000000000LL, 0x00000002,
+                 0x8000000000000000LL, USR_CLEAR);
+    TEST_P_OP_PR(vcrotate, 0x7fff80007fff8000LL, 0x00000001,
+                 0x7fff80007fff7fffLL, USR_OVF);
+
+    TEST_P_OP_PR(vcnegh, 0x8000000000000000LL, 0x00000002,
+                 0x8000000000000000LL, USR_CLEAR);
+    TEST_P_OP_PR(vcnegh, 0x7fff80007fff8000LL, 0x00000001,
+                 0x7fff80007fff7fffLL, USR_OVF);
+
+#if CORE_HAS_AUDIO
+    TEST_R_OP_PP(wcmpyrw, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0x00000001, USR_CLEAR);
+    TEST_R_OP_PP(wcmpyrw, 0x800000007fffffffLL, 0x000000ff7fffffffLL,
+                 0x7fffffff, USR_OVF);
+    TEST_R_OP_PP(wcmpyrw, 0x7fffffff80000000LL, 0x7fffffff000000ffLL,
+                 0x80000000, USR_OVF);
+#else
+    printf("Audio instructions skipped\n");
+#endif
+
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x0000ffff, 0x00000002, 0x00000001,
+                 USR_CLEAR);
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x00007fff, 0x00000005, 0x00007fff,
+                 USR_OVF);
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x00008000, 0x00008000, 0xffff8000,
+                 USR_OVF);
+
+    TEST_P_OP_P(vconj, 0x0000ffff00000001LL, 0x0000ffff00000001LL, USR_CLEAR);
+    TEST_P_OP_P(vconj, 0x800000000000ffffLL, 0x7fff00000000ffffLL, USR_OVF);
+
+    TEST_P_OP_PP(vxaddsubw, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0x8765432201234569LL, USR_CLEAR);
+    TEST_P_OP_PP(vxaddsubw, 0x7fffffff7fffffffLL, 0xffffffffffffffffLL,
+                 0x7fffffff7ffffffeLL, USR_OVF);
+    TEST_P_OP_PP(vxaddsubw, 0x800000000fffffffLL, 0x0000000a00000008LL,
+                 0x8000000010000009LL, USR_OVF);
+
+    TEST_P_OP_P(vabshsat, 0x0001000afffff800LL, 0x0001000a00010800LL,
+                USR_CLEAR);
+    TEST_P_OP_P(vabshsat, 0x8000000b000c000aLL, 0x7fff000b000c000aLL,
+             USR_OVF);
+
+    TEST_P_OP_PP(vnavgwr, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0xc3b2a1900091a2b4LL, USR_CLEAR);
+    TEST_P_OP_PP(vnavgwr, 0x7fffffff8000000aLL, 0x80000000ffffffffLL,
+                 0x7fffffffc0000006LL, USR_OVF);
+
+    TEST_R_OP_RI(round_ri_sat,         0x0000ffff, 2, 0x00004000, USR_CLEAR);
+    TEST_R_OP_RI(round_ri_sat,         0x7fffffff, 2, 0x1fffffff, USR_OVF);
+
+    TEST_R_OP_RR(asr_r_r_sat,          0x0000ffff, 0x00000002, 0x00003fff,
+                 USR_CLEAR);
+    TEST_R_OP_RR(asr_r_r_sat,          0x00ffffff, 0xfffffff5, 0x7fffffff,
+                 USR_OVF);
+    TEST_R_OP_RR(asr_r_r_sat,          0x80000000, 0xfffffff5, 0x80000000,
+                 USR_OVF);
+
+    TEST_XPp_OP_PP(ACS, 0x0004000300020001ULL, 0x0001000200030004ULL,
+                   0x0000000000000000ULL, 0x0004000300030004ULL, 0xf0,
+                   USR_CLEAR);
+    TEST_XPp_OP_PP(ACS, 0x0004000300020001ULL, 0x0001000200030004ULL,
+                   0x000affff000d0000ULL, 0x000e0003000f0004ULL, 0xcc,
+                   USR_CLEAR);
+    TEST_XPp_OP_PP(ACS, 0x00047fff00020001ULL, 0x00017fff00030004ULL,
+                  0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xfc,
+                  USR_OVF);
+    TEST_XPp_OP_PP(ACS, 0x00047fff00020001ULL, 0x00017fff00030004ULL,
+                   0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xf0,
+                   USR_OVF);
+
+    puts(err ? "FAIL" : "PASS");
+    return err;
+}

From 2479540fff4aa4519ff45e122be360492f970598 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:51 -0800
Subject: [PATCH 07/12] Hexagon (tests/tcg/hexagon) add floating point
 instructions to usr.c

Tests to confirm floating point instructions are properly
setting exception bits in USR

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-8-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/usr.c | 339 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 339 insertions(+)

diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
index e82727237e..11415f8295 100644
--- a/tests/tcg/hexagon/usr.c
+++ b/tests/tcg/hexagon/usr.c
@@ -78,6 +78,34 @@ static void __check64(int line, uint64_t val, uint64_t expect)
 #define USR_FPUNFF           (1 << USR_FPUNFF_BIT)
 #define USR_FPINPF           (1 << USR_FPINPF_BIT)
 
+/* Some useful floating point values */
+const uint32_t SF_INF =              0x7f800000;
+const uint32_t SF_QNaN =             0x7fc00000;
+const uint32_t SF_SNaN =             0x7fb00000;
+const uint32_t SF_QNaN_neg =         0xffc00000;
+const uint32_t SF_SNaN_neg =         0xffb00000;
+const uint32_t SF_HEX_NaN =          0xffffffff;
+const uint32_t SF_zero =             0x00000000;
+const uint32_t SF_one =              0x3f800000;
+const uint32_t SF_one_recip =        0x3f7f0001;         /* 0.9960...  */
+const uint32_t SF_one_invsqrta =     0x3f7f0000;         /* 0.99609375 */
+const uint32_t SF_two =              0x40000000;
+const uint32_t SF_four =             0x40800000;
+const uint32_t SF_small_neg =        0xab98fba8;
+const uint32_t SF_large_pos =        0x5afa572e;
+
+const uint64_t DF_QNaN =             0x7ff8000000000000ULL;
+const uint64_t DF_SNaN =             0x7ff7000000000000ULL;
+const uint64_t DF_QNaN_neg =         0xfff8000000000000ULL;
+const uint64_t DF_SNaN_neg =         0xfff7000000000000ULL;
+const uint64_t DF_HEX_NaN =          0xffffffffffffffffULL;
+const uint64_t DF_zero =             0x0000000000000000ULL;
+const uint64_t DF_any =              0x3f80000000000000ULL;
+const uint64_t DF_one =              0x3ff0000000000000ULL;
+const uint64_t DF_one_hh =           0x3ff001ff80000000ULL;     /* 1.00048... */
+const uint64_t DF_small_neg =        0xbd731f7500000000ULL;
+const uint64_t DF_large_pos =        0x7f80000000000001ULL;
+
 /*
  * Templates for functions to execute an instruction
  *
@@ -309,6 +337,29 @@ static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, uint8_t pred,\
 #define FUNC_XR_OP_RRp(NAME, INSN) \
 FUNC_Xx_OP_xxp(uint32_t, uint32_t, uint32_t, NAME, INSN)
 
+/* Template for compare instructions with two register operands */
+#define FUNC_CMP_xx(SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static uint32_t NAME(SRC1TYPE src1, SRC2TYPE src2, uint32_t *usr_result) \
+{ \
+    uint32_t result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%0 = p1\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "p1", "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_CMP_RR(NAME, INSN) \
+FUNC_CMP_xx(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_CMP_PP(NAME, INSN) \
+FUNC_CMP_xx(uint64_t, uint64_t, NAME, INSN)
+
 /*
  * Function declarations using the templates
  */
@@ -379,6 +430,69 @@ FUNC_R_OP_RR(asr_r_r_sat,       "%0 = asr(%2, %3):sat")
 
 FUNC_XPp_OP_PP(ACS,             "%0, p2 = vacsh(%3, %4)")
 
+/* Floating point */
+FUNC_R_OP_RR(sfmin,             "%0 = sfmin(%2, %3)")
+FUNC_R_OP_RR(sfmax,             "%0 = sfmax(%2, %3)")
+FUNC_R_OP_RR(sfadd,             "%0 = sfadd(%2, %3)")
+FUNC_R_OP_RR(sfsub,             "%0 = sfsub(%2, %3)")
+FUNC_R_OP_RR(sfmpy,             "%0 = sfmpy(%2, %3)")
+FUNC_XR_OP_RR(sffma,            "%0 += sfmpy(%2, %3)")
+FUNC_XR_OP_RR(sffms,            "%0 -= sfmpy(%2, %3)")
+FUNC_CMP_RR(sfcmpuo,            "p1 = sfcmp.uo(%2, %3)")
+FUNC_CMP_RR(sfcmpeq,            "p1 = sfcmp.eq(%2, %3)")
+FUNC_CMP_RR(sfcmpgt,            "p1 = sfcmp.gt(%2, %3)")
+FUNC_CMP_RR(sfcmpge,            "p1 = sfcmp.ge(%2, %3)")
+
+FUNC_P_OP_PP(dfadd,             "%0 = dfadd(%2, %3)")
+FUNC_P_OP_PP(dfsub,             "%0 = dfsub(%2, %3)")
+
+#if CORE_IS_V67
+FUNC_P_OP_PP(dfmin,             "%0 = dfmin(%2, %3)")
+FUNC_P_OP_PP(dfmax,             "%0 = dfmax(%2, %3)")
+FUNC_XP_OP_PP(dfmpyhh,          "%0 += dfmpyhh(%2, %3)")
+#endif
+
+FUNC_CMP_PP(dfcmpuo,            "p1 = dfcmp.uo(%2, %3)")
+FUNC_CMP_PP(dfcmpeq,            "p1 = dfcmp.eq(%2, %3)")
+FUNC_CMP_PP(dfcmpgt,            "p1 = dfcmp.gt(%2, %3)")
+FUNC_CMP_PP(dfcmpge,            "p1 = dfcmp.ge(%2, %3)")
+
+/* Conversions from sf */
+FUNC_P_OP_R(conv_sf2df,         "%0 = convert_sf2df(%2)")
+FUNC_R_OP_R(conv_sf2uw,         "%0 = convert_sf2uw(%2)")
+FUNC_R_OP_R(conv_sf2w,          "%0 = convert_sf2w(%2)")
+FUNC_P_OP_R(conv_sf2ud,         "%0 = convert_sf2ud(%2)")
+FUNC_P_OP_R(conv_sf2d,          "%0 = convert_sf2d(%2)")
+FUNC_R_OP_R(conv_sf2uw_chop,    "%0 = convert_sf2uw(%2):chop")
+FUNC_R_OP_R(conv_sf2w_chop,     "%0 = convert_sf2w(%2):chop")
+FUNC_P_OP_R(conv_sf2ud_chop,    "%0 = convert_sf2ud(%2):chop")
+FUNC_P_OP_R(conv_sf2d_chop,     "%0 = convert_sf2d(%2):chop")
+
+/* Conversions from df */
+FUNC_R_OP_P(conv_df2sf,         "%0 = convert_df2sf(%2)")
+FUNC_R_OP_P(conv_df2uw,         "%0 = convert_df2uw(%2)")
+FUNC_R_OP_P(conv_df2w,          "%0 = convert_df2w(%2)")
+FUNC_P_OP_P(conv_df2ud,         "%0 = convert_df2ud(%2)")
+FUNC_P_OP_P(conv_df2d,          "%0 = convert_df2d(%2)")
+FUNC_R_OP_P(conv_df2uw_chop,    "%0 = convert_df2uw(%2):chop")
+FUNC_R_OP_P(conv_df2w_chop,     "%0 = convert_df2w(%2):chop")
+FUNC_P_OP_P(conv_df2ud_chop,    "%0 = convert_df2ud(%2):chop")
+FUNC_P_OP_P(conv_df2d_chop,     "%0 = convert_df2d(%2):chop")
+
+/* Integer to float conversions */
+FUNC_R_OP_R(conv_uw2sf,         "%0 = convert_uw2sf(%2)")
+FUNC_R_OP_R(conv_w2sf,          "%0 = convert_w2sf(%2)")
+FUNC_R_OP_P(conv_ud2sf,         "%0 = convert_ud2sf(%2)")
+FUNC_R_OP_P(conv_d2sf,          "%0 = convert_d2sf(%2)")
+
+/* Special purpose floating point instructions */
+FUNC_XR_OP_RRp(sffma_sc,        "%0 += sfmpy(%2, %3, p2):scale")
+FUNC_Rp_OP_RR(sfrecipa,         "%0, p2 = sfrecipa(%3, %4)")
+FUNC_R_OP_RR(sffixupn,          "%0 = sffixupn(%2, %3)")
+FUNC_R_OP_RR(sffixupd,          "%0 = sffixupd(%2, %3)")
+FUNC_R_OP_R(sffixupr,           "%0 = sffixupr(%2)")
+FUNC_Rp_OP_R(sfinvsqrta,        "%0, p2 = sfinvsqrta(%3)")
+
 /*
  * Templates for test cases
  *
@@ -554,6 +668,24 @@ TEST_Xxp_OP_xx(uint64_t, check64, uint64_t, uint64_t, FUNC, RESIN, SRC1, SRC2, \
 TEST_Xx_OP_xxp(uint32_t, check32, uint32_t, uint32_t, \
               FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES)
 
+#define TEST_CMP_xx(SRC1TYPE, SRC2TYPE, \
+                    FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        uint32_t result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        check(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_CMP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_CMP_xx(uint32_t, uint32_t, FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_CMP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_CMP_xx(uint64_t, uint64_t, FUNC, SRC1, SRC2, RES, USR_RES)
+
 int main()
 {
     TEST_R_OP_R(satub,       0,         0,         USR_CLEAR);
@@ -793,6 +925,213 @@ int main()
                    0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xf0,
                    USR_OVF);
 
+    /* Floating point */
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_small_neg,   SF_small_neg, USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_SNaN,        SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_SNaN,     SF_one,         SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_QNaN,        SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_QNaN,     SF_one,         SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_small_neg,   SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_SNaN,        SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_SNaN,     SF_one,         SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_QNaN,        SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_QNaN,     SF_one,         SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfadd,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfadd,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfadd,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfadd,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfsub,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfsub,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfsub,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfsub,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfmpy,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfmpy,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmpy,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmpy,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_XR_OP_RR(sffma, SF_one,   SF_one,    SF_one,   SF_two,     USR_CLEAR);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_one,    SF_QNaN,  SF_HEX_NaN, USR_CLEAR);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_one,    SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_QNaN,   SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_SNaN,   SF_QNaN,  SF_HEX_NaN, USR_FPINVF);
+
+    TEST_XR_OP_RR(sffms, SF_one,   SF_one,    SF_one,   SF_zero,    USR_CLEAR);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_one,    SF_QNaN,  SF_HEX_NaN, USR_CLEAR);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_one,    SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_QNaN,   SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_SNaN,   SF_QNaN,  SF_HEX_NaN, USR_FPINVF);
+
+    TEST_CMP_RR(sfcmpuo, SF_one,      SF_large_pos,    0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_INF,      SF_large_pos,    0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_large_pos,    0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN_neg, SF_large_pos,    0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_SNaN,     SF_large_pos,    0xff,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpuo, SF_SNaN_neg, SF_large_pos,    0xff,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_QNaN,         0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_SNaN,         0xff,    USR_FPINVF);
+
+    TEST_CMP_RR(sfcmpeq, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpeq, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpgt, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpgt, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpge, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpge, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfadd,  DF_any,    DF_QNaN,         DF_HEX_NaN,    USR_CLEAR);
+    TEST_P_OP_PP(dfadd,  DF_any,    DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfadd,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfadd,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfsub,  DF_any,    DF_QNaN,         DF_HEX_NaN,    USR_CLEAR);
+    TEST_P_OP_PP(dfsub,  DF_any,    DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfsub,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfsub,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+#if CORE_IS_V67
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_small_neg,    DF_small_neg,  USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_SNaN,         DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_SNaN,   DF_any,          DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_QNaN,         DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_QNaN,   DF_any,          DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_small_neg,    DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_SNaN,         DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_SNaN,   DF_any,          DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_QNaN,         DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_QNaN,   DF_any,          DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_XP_OP_PP(dfmpyhh, DF_one,   DF_one,  DF_one,   DF_one_hh,  USR_CLEAR);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_any,  DF_QNaN,  DF_HEX_NaN, USR_CLEAR);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_any,  DF_SNaN,  DF_HEX_NaN, USR_FPINVF);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_QNaN, DF_SNaN,  DF_HEX_NaN, USR_FPINVF);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_SNaN, DF_QNaN,  DF_HEX_NaN, USR_FPINVF);
+#else
+    printf("v67 instructions skipped\n");
+#endif
+
+    TEST_CMP_PP(dfcmpuo, DF_small_neg, DF_any,          0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_large_pos, DF_any,          0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_any,          0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN_neg,  DF_any,          0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_SNaN,      DF_any,          0xff,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpuo, DF_SNaN_neg,  DF_any,          0xff,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_QNaN,         0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_SNaN,         0xff,    USR_FPINVF);
+
+    TEST_CMP_PP(dfcmpeq, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpeq, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpgt, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpgt, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpge, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpge, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+
+    TEST_P_OP_R(conv_sf2df,       SF_QNaN,  DF_HEX_NaN,             USR_CLEAR);
+    TEST_P_OP_R(conv_sf2df,       SF_SNaN,  DF_HEX_NaN,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw,       SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw,       SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w,        SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w,        SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud,       SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud,       SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d,        SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d,        SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw_chop,  SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw_chop,  SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w_chop,   SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w_chop,   SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud_chop,  SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud_chop,  SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d_chop,   SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d_chop,   SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+
+    TEST_R_OP_P(conv_df2sf,       DF_QNaN,  SF_HEX_NaN,             USR_CLEAR);
+    TEST_R_OP_P(conv_df2sf,       DF_SNaN,  SF_HEX_NaN,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw,       DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw,       DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w,        DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w,        DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud,       DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud,       DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d,        DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d,        DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw_chop,  DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw_chop,  DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w_chop,   DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w_chop,   DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud_chop,  DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud_chop,  DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d_chop,   DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d_chop,   DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+
+    TEST_R_OP_R(conv_uw2sf,    0x00000001,             SF_one,      USR_CLEAR);
+    TEST_R_OP_R(conv_uw2sf,    0x010020a5,             0x4b801052,  USR_FPINPF);
+    TEST_R_OP_R(conv_w2sf,     0x00000001,             SF_one,      USR_CLEAR);
+    TEST_R_OP_R(conv_w2sf,     0x010020a5,             0x4b801052,  USR_FPINPF);
+    TEST_R_OP_P(conv_ud2sf,    0x0000000000000001ULL,  SF_one,      USR_CLEAR);
+    TEST_R_OP_P(conv_ud2sf,    0x00000000010020a5ULL,  0x4b801052,  USR_FPINPF);
+    TEST_R_OP_P(conv_d2sf,     0x0000000000000001ULL,  SF_one,      USR_CLEAR);
+    TEST_R_OP_P(conv_d2sf,     0x00000000010020a5ULL,  0x4b801052,  USR_FPINPF);
+
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_one,   1, SF_four,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_QNaN,  SF_one,    SF_one,   1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_QNaN,   SF_one,   1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_QNaN,  1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_SNaN,  SF_one,    SF_one,   1, SF_HEX_NaN,
+                   USR_FPINVF);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_SNaN,   SF_one,   1, SF_HEX_NaN,
+                   USR_FPINVF);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_SNaN,  1, SF_HEX_NaN,
+                   USR_FPINVF);
+
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_one,    SF_one_recip,   0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_QNaN,   SF_one,    SF_HEX_NaN,     0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_QNaN,   SF_HEX_NaN,     0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_SNaN,   SF_HEX_NaN,     0x00,
+                  USR_FPINVF);
+    TEST_Rp_OP_RR(sfrecipa, SF_SNaN,   SF_one,    SF_HEX_NaN,     0x00,
+                  USR_FPINVF);
+
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_one,    SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_QNaN,    SF_one,    SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_QNaN,   SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_SNaN,    SF_one,    SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_SNaN,   SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_one,    SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_QNaN,    SF_one,    SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_QNaN,   SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_SNaN,    SF_one,    SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_SNaN,   SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_R(sffixupr, SF_one,             SF_one,           USR_CLEAR);
+    TEST_R_OP_R(sffixupr, SF_QNaN,            SF_HEX_NaN,       USR_CLEAR);
+    TEST_R_OP_R(sffixupr, SF_SNaN,            SF_HEX_NaN,       USR_FPINVF);
+
+    TEST_Rp_OP_R(sfinvsqrta, SF_one,        SF_one_invsqrta,  0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_zero,       SF_one,           0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_QNaN,       SF_HEX_NaN,       0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_small_neg,  SF_HEX_NaN,       0x00, USR_FPINVF);
+    TEST_Rp_OP_R(sfinvsqrta, SF_SNaN,       SF_HEX_NaN,       0x00, USR_FPINVF);
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }

From 8576e7ecae056845de6e0bafc547501f2bc6461c Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:52 -0800
Subject: [PATCH 08/12] Hexagon (tests/tcg/hexagon) update overflow test

Add a test that sets USR multiple times in a packet

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-9-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/overflow.c | 61 +++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/tcg/hexagon/overflow.c b/tests/tcg/hexagon/overflow.c
index 196fcf7f3a..94087851b0 100644
--- a/tests/tcg/hexagon/overflow.c
+++ b/tests/tcg/hexagon/overflow.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -72,6 +72,20 @@ int read_usr_overflow(void)
     return result & 1;
 }
 
+int get_usr_overflow(int usr)
+{
+    return usr & 1;
+}
+
+int get_usr_fp_invalid(int usr)
+{
+    return (usr >> 1) & 1;
+}
+
+int get_usr_lpcfg(int usr)
+{
+    return (usr >> 8) & 0x3;
+}
 
 jmp_buf jmp_env;
 int usr_overflow;
@@ -82,6 +96,49 @@ static void sig_segv(int sig, siginfo_t *info, void *puc)
     longjmp(jmp_env, 1);
 }
 
+static void test_packet(void)
+{
+    int convres;
+    int satres;
+    int usr;
+
+    asm("r2 = usr\n\t"
+        "r2 = clrbit(r2, #0)\n\t"        /* clear overflow bit */
+        "r2 = clrbit(r2, #1)\n\t"        /* clear FP invalid bit */
+        "usr = r2\n\t"
+        "{\n\t"
+        "    %0 = convert_sf2uw(%3):chop\n\t"
+        "    %1 = satb(%4)\n\t"
+        "}\n\t"
+        "%2 = usr\n\t"
+        : "=r"(convres), "=r"(satres), "=r"(usr)
+        : "r"(0x6a051b86), "r"(0x0410eec0)
+        : "r2", "usr");
+
+    check(convres, 0xffffffff);
+    check(satres, 0x7f);
+    check(get_usr_overflow(usr), 1);
+    check(get_usr_fp_invalid(usr), 1);
+
+    asm("r2 = usr\n\t"
+        "r2 = clrbit(r2, #0)\n\t"        /* clear overflow bit */
+        "usr = r2\n\t"
+        "%2 = r2\n\t"
+        "p3 = sp3loop0(1f, #1)\n\t"
+        "1:\n\t"
+        "{\n\t"
+        "    %0 = satb(%2)\n\t"
+        "}:endloop0\n\t"
+        "%1 = usr\n\t"
+        : "=r"(satres), "=r"(usr)
+        : "r"(0x0410eec0)
+        : "r2", "usr", "p3", "sa0", "lc0");
+
+    check(satres, 0x7f);
+    check(get_usr_overflow(usr), 1);
+    check(get_usr_lpcfg(usr), 2);
+}
+
 int main()
 {
     struct sigaction act;
@@ -102,6 +159,8 @@ int main()
 
     check(usr_overflow, 0);
 
+    test_packet();
+
     puts(err ? "FAIL" : "PASS");
     return err ? EXIT_FAILURE : EXIT_SUCCESS;
 }

From 8af2d9978ad2c52377fe69466a556fffeedcd057 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:53 -0800
Subject: [PATCH 09/12] Hexagon (tests/tcg/hexagon) fix inline asm in
 preg_alias.c

Replace consecutive inline asm blocks with a single one with proper
outputs/inputs/clobbers rather than making assumptions about register
values being carried between separate blocks.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-10-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/preg_alias.c | 44 ++++++++++++++++------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/tests/tcg/hexagon/preg_alias.c b/tests/tcg/hexagon/preg_alias.c
index 0cac469b78..79febeca97 100644
--- a/tests/tcg/hexagon/preg_alias.c
+++ b/tests/tcg/hexagon/preg_alias.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -57,17 +57,15 @@ typedef union {
 
 static inline void creg_alias(int cval, PRegs *pregs)
 {
-  unsigned char val;
-  asm volatile("c4 = %0" : : "r"(cval));
-
-  asm volatile("%0 = p0" : "=r"(val));
-  pregs->pregs.p0 = val;
-  asm volatile("%0 = p1" : "=r"(val));
-  pregs->pregs.p1 = val;
-  asm volatile("%0 = p2" : "=r"(val));
-  pregs->pregs.p2 = val;
-  asm volatile("%0 = p3" : "=r"(val));
-  pregs->pregs.p3 = val;
+  asm("c4 = %4\n\t"
+      "%0 = p0\n\t"
+      "%1 = p1\n\t"
+      "%2 = p2\n\t"
+      "%3 = p3\n\t"
+      : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
+        "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3)
+      : "r"(cval)
+      : "p0", "p1", "p2", "p3");
 }
 
 int err;
@@ -83,19 +81,19 @@ static void check(int val, int expect)
 static inline void creg_alias_pair(unsigned int cval, PRegs *pregs)
 {
   unsigned long long cval_pair = (0xdeadbeefULL << 32) | cval;
-  unsigned char val;
   int c5;
-  asm volatile("c5:4 = %0" : : "r"(cval_pair));
 
-  asm volatile("%0 = p0" : "=r"(val));
-  pregs->pregs.p0 = val;
-  asm volatile("%0 = p1" : "=r"(val));
-  pregs->pregs.p1 = val;
-  asm volatile("%0 = p2" : "=r"(val));
-  pregs->pregs.p2 = val;
-  asm volatile("%0 = p3" : "=r"(val));
-  pregs->pregs.p3 = val;
-  asm volatile("%0 = c5" : "=r"(c5));
+  asm ("c5:4 = %5\n\t"
+       "%0 = p0\n\t"
+       "%1 = p1\n\t"
+       "%2 = p2\n\t"
+       "%3 = p3\n\t"
+       "%4 = c5\n\t"
+       : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
+         "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3), "=r"(c5)
+       : "r"(cval_pair)
+       : "p0", "p1", "p2", "p3");
+
   check(c5, 0xdeadbeef);
 }
 

From 3977ba3078503fca0f182aa8a39fad2388f43cb1 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:54 -0800
Subject: [PATCH 10/12] Hexagon (target/hexagon) fix bug in conv_df2uw_chop

Fix typo that checked for a 32-bit NaN instead of a 64-bit NaN

Test case added in tests/tcg/hexagon/usr.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-11-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 2 +-
 tests/tcg/hexagon/usr.c    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 366caf9ec8..63e5ad5d68 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -829,7 +829,7 @@ uint32_t HELPER(conv_df2uw_chop)(CPUHexagonState *env, float64 RssV)
     uint32_t RdV;
     arch_fpop_start(env);
     /* Hexagon checks the sign before rounding */
-    if (float64_is_neg(RssV) && !float32_is_any_nan(RssV)) {
+    if (float64_is_neg(RssV) && !float64_is_any_nan(RssV)) {
         float_raise(float_flag_invalid, &env->fp_status);
         RdV = 0;
     } else {
diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
index 11415f8295..a531511cec 100644
--- a/tests/tcg/hexagon/usr.c
+++ b/tests/tcg/hexagon/usr.c
@@ -1068,6 +1068,10 @@ int main()
     TEST_P_OP_P(conv_df2d,        DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
     TEST_R_OP_P(conv_df2uw_chop,  DF_QNaN,  0xffffffff,             USR_FPINVF);
     TEST_R_OP_P(conv_df2uw_chop,  DF_SNaN,  0xffffffff,             USR_FPINVF);
+
+    /* Test for typo in HELPER(conv_df2uw_chop) */
+    TEST_R_OP_P(conv_df2uw_chop, 0xffffff7f00000001ULL, 0xffffffff, USR_FPINVF);
+
     TEST_R_OP_P(conv_df2w_chop,   DF_QNaN,  0xffffffff,             USR_FPINVF);
     TEST_R_OP_P(conv_df2w_chop,   DF_SNaN,  0xffffffff,             USR_FPINVF);
     TEST_P_OP_P(conv_df2ud_chop,  DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);

From c0d86060f033fc8d591b0163e380ff6cd04f213a Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:56 -0800
Subject: [PATCH 11/12] Hexagon (target/hexagon) assignment to c4 should wait
 until packet commit

On Hexagon, c4 is an alias for predicate registers P3:0.  If we assign to
c4 inside a packet with reads from predicate registers, the predicate
reads should get the old values.

Test case added to tests/tcg/hexagon/preg_alias.c

Co-authored-by: Michael Lambert <mlambert@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-13-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/genptr.c        | 14 ++++++++-----
 tests/tcg/hexagon/preg_alias.c | 38 ++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index 4419d30e23..cd6af4bceb 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -210,11 +210,15 @@ static inline void gen_read_ctrl_reg_pair(DisasContext *ctx, const int reg_num,
     }
 }
 
-static inline void gen_write_p3_0(TCGv control_reg)
+static void gen_write_p3_0(DisasContext *ctx, TCGv control_reg)
 {
+    TCGv hex_p8 = tcg_temp_new();
     for (int i = 0; i < NUM_PREGS; i++) {
-        tcg_gen_extract_tl(hex_pred[i], control_reg, i * 8, 8);
+        tcg_gen_extract_tl(hex_p8, control_reg, i * 8, 8);
+        gen_log_pred_write(ctx, i, hex_p8);
+        ctx_log_pred_write(ctx, i);
     }
+    tcg_temp_free(hex_p8);
 }
 
 /*
@@ -228,7 +232,7 @@ static inline void gen_write_ctrl_reg(DisasContext *ctx, int reg_num,
                                       TCGv val)
 {
     if (reg_num == HEX_REG_P3_0) {
-        gen_write_p3_0(val);
+        gen_write_p3_0(ctx, val);
     } else {
         gen_log_reg_write(reg_num, val);
         ctx_log_reg_write(ctx, reg_num);
@@ -250,7 +254,7 @@ static inline void gen_write_ctrl_reg_pair(DisasContext *ctx, int reg_num,
     if (reg_num == HEX_REG_P3_0) {
         TCGv val32 = tcg_temp_new();
         tcg_gen_extrl_i64_i32(val32, val);
-        gen_write_p3_0(val32);
+        gen_write_p3_0(ctx, val32);
         tcg_gen_extrh_i64_i32(val32, val);
         gen_log_reg_write(reg_num + 1, val32);
         tcg_temp_free(val32);
diff --git a/tests/tcg/hexagon/preg_alias.c b/tests/tcg/hexagon/preg_alias.c
index 79febeca97..b44a8112b4 100644
--- a/tests/tcg/hexagon/preg_alias.c
+++ b/tests/tcg/hexagon/preg_alias.c
@@ -97,6 +97,42 @@ static inline void creg_alias_pair(unsigned int cval, PRegs *pregs)
   check(c5, 0xdeadbeef);
 }
 
+static void test_packet(void)
+{
+    /*
+     * Test that setting c4 inside a packet doesn't impact the predicates
+     * that are read during the packet.
+     */
+
+    int result;
+    int old_val = 0x0000001c;
+
+    /* Test a predicated register transfer */
+    result = old_val;
+    asm (
+         "c4 = %1\n\t"
+         "{\n\t"
+         "    c4 = %2\n\t"
+         "    if (!p2) %0 = %3\n\t"
+         "}\n\t"
+         : "+r"(result)
+         : "r"(0xffffffff), "r"(0xff00ffff), "r"(0x837ed653)
+         : "p0", "p1", "p2", "p3");
+    check(result, old_val);
+
+    /* Test a predicated store */
+    result = 0xffffffff;
+    asm ("c4 = %0\n\t"
+         "{\n\t"
+         "    c4 = %1\n\t"
+         "    if (!p2) memw(%2) = #0\n\t"
+         "}\n\t"
+         :
+         : "r"(0), "r"(0xffffffff), "r"(&result)
+         : "p0", "p1", "p2", "p3", "memory");
+    check(result, 0x0);
+}
+
 int main()
 {
     int c4;
@@ -162,6 +198,8 @@ int main()
     creg_alias_pair(0xffffffff, &pregs);
     check(pregs.creg, 0xffffffff);
 
+    test_packet();
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }

From 4a818b3767220dcd21cf5cc7f12e33e28c2073ed Mon Sep 17 00:00:00 2001
From: Zongyuan Li <zongyuan.li@smartx.com>
Date: Mon, 24 Jan 2022 14:43:39 +0800
Subject: [PATCH 12/12] target/hexagon: remove unused variable

When building with clang version 13.0.0 (e.g. Fedora 13.0.0-3.fc35),
two unused variables introduced by the macros GATHER_FUNCTION and
SCATTER_FUNCTION cause the build to fail due to
[-Werror -Wunused-variable].

Signed-off-by: Zongyuan Li <zongyuan.li@smartx.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/831
Message-Id: <20220124064339.56027-1-zongyuan.li@smartx.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
---
 target/hexagon/mmvec/macros.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
index 10f4630364..8345753580 100644
--- a/target/hexagon/mmvec/macros.h
+++ b/target/hexagon/mmvec/macros.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -164,11 +164,9 @@
         target_ulong va = EA; \
         target_ulong va_high = EA + LEN; \
         uintptr_t ra = GETPC(); \
-        int log_bank = 0; \
         int log_byte = 0; \
         for (i0 = 0; i0 < ELEMENT_SIZE; i0++) { \
             log_byte = ((va + i0) <= va_high) && QVAL; \
-            log_bank |= (log_byte << i0); \
             uint8_t B; \
             B = cpu_ldub_data_ra(env, EA + i0, ra); \
             env->tmp_VRegs[0].ub[ELEMENT_SIZE * IDX + i0] = B; \
@@ -243,11 +241,9 @@
         int i0; \
         target_ulong va = EA; \
         target_ulong va_high = EA + LEN; \
-        int log_bank = 0; \
         int log_byte = 0; \
         for (i0 = 0; i0 < ELEM_SIZE; i0++) { \
             log_byte = ((va + i0) <= va_high) && QVAL; \
-            log_bank |= (log_byte << i0); \
             LOG_VTCM_BYTE(va + i0, log_byte, IN.ub[ELEM_SIZE * IDX + i0], \
                           ELEM_SIZE * IDX + i0); \
         } \