From 9948c38bd9aef8fa762a1b62b9fccc35e11a6fd5 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:45 +0000
Subject: [PATCH 01/30] vexpress: Set reset-cbar property for CPUs

Newer versions of the Linux kernel (as of commit bc41b8724 in 3.12)
now assume that if the CPU is a Cortex-A9 and the reset value of the
PERIPHBASE/CBAR register is zero then the CPU is a specific buggy
single core A9 SoC, and will not try to start other cores. Since we
now have a CPU property for the reset value of the CBAR, we can
just fix the vexpress board model to correctly set CBAR so SMP
works again. To avoid duplicate boilerplate code in both the A9
and A15 daughterboard init functions, we split out the CPU and
private memory region init to its own function.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reported-by: Rob Herring <rob.herring@linaro.org>
Reviewed-by: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
Message-id: 1394462692-8871-2-git-send-email-peter.maydell@linaro.org
---
 hw/arm/vexpress.c | 123 +++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 62 deletions(-)

diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c
index ef1707aef0..67628af588 100644
--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@@ -32,6 +32,7 @@
 #include "sysemu/blockdev.h"
 #include "hw/block/flash.h"
 #include "sysemu/device_tree.h"
+#include "qemu/error-report.h"
 #include <libfdt.h>
 
 #define VEXPRESS_BOARD_ID 0x8e0
@@ -173,6 +174,64 @@ struct VEDBoardInfo {
     DBoardInitFn *init;
 };
 
+static void init_cpus(const char *cpu_model, const char *privdev,
+                      hwaddr periphbase, qemu_irq *pic)
+{
+    ObjectClass *cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model);
+    DeviceState *dev;
+    SysBusDevice *busdev;
+    int n;
+
+    if (!cpu_oc) {
+        fprintf(stderr, "Unable to find CPU definition\n");
+        exit(1);
+    }
+
+    /* Create the actual CPUs */
+    for (n = 0; n < smp_cpus; n++) {
+        Object *cpuobj = object_new(object_class_get_name(cpu_oc));
+        Error *err = NULL;
+
+        object_property_set_int(cpuobj, periphbase, "reset-cbar", &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
+            exit(1);
+        }
+        object_property_set_bool(cpuobj, true, "realized", &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
+            exit(1);
+        }
+    }
+
+    /* Create the private peripheral devices (including the GIC);
+     * this must happen after the CPUs are created because a15mpcore_priv
+     * wires itself up to the CPU's generic_timer gpio out lines.
+     */
+    dev = qdev_create(NULL, privdev);
+    qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
+    qdev_init_nofail(dev);
+    busdev = SYS_BUS_DEVICE(dev);
+    sysbus_mmio_map(busdev, 0, periphbase);
+
+    /* Interrupts [42:0] are from the motherboard;
+     * [47:43] are reserved; [63:48] are daughterboard
+     * peripherals. Note that some documentation numbers
+     * external interrupts starting from 32 (because there
+     * are internal interrupts 0..31).
+     */
+    for (n = 0; n < 64; n++) {
+        pic[n] = qdev_get_gpio_in(dev, n);
+    }
+
+    /* Connect the CPUs to the GIC */
+    for (n = 0; n < smp_cpus; n++) {
+        DeviceState *cpudev = DEVICE(qemu_get_cpu(n));
+
+        sysbus_connect_irq(busdev, n, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ));
+    }
+}
+
 static void a9_daughterboard_init(const VEDBoardInfo *daughterboard,
                                   ram_addr_t ram_size,
                                   const char *cpu_model,
@@ -181,25 +240,12 @@ static void a9_daughterboard_init(const VEDBoardInfo *daughterboard,
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram = g_new(MemoryRegion, 1);
     MemoryRegion *lowram = g_new(MemoryRegion, 1);
-    DeviceState *dev;
-    SysBusDevice *busdev;
-    int n;
-    qemu_irq cpu_irq[4];
     ram_addr_t low_ram_size;
 
     if (!cpu_model) {
         cpu_model = "cortex-a9";
     }
 
-    for (n = 0; n < smp_cpus; n++) {
-        ARMCPU *cpu = cpu_arm_init(cpu_model);
-        if (!cpu) {
-            fprintf(stderr, "Unable to find CPU definition\n");
-            exit(1);
-        }
-        cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ);
-    }
-
     if (ram_size > 0x40000000) {
         /* 1GB is the maximum the address space permits */
         fprintf(stderr, "vexpress-a9: cannot model more than 1GB RAM\n");
@@ -221,23 +267,7 @@ static void a9_daughterboard_init(const VEDBoardInfo *daughterboard,
     memory_region_add_subregion(sysmem, 0x60000000, ram);
 
     /* 0x1e000000 A9MPCore (SCU) private memory region */
-    dev = qdev_create(NULL, "a9mpcore_priv");
-    qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
-    qdev_init_nofail(dev);
-    busdev = SYS_BUS_DEVICE(dev);
-    sysbus_mmio_map(busdev, 0, 0x1e000000);
-    for (n = 0; n < smp_cpus; n++) {
-        sysbus_connect_irq(busdev, n, cpu_irq[n]);
-    }
-    /* Interrupts [42:0] are from the motherboard;
-     * [47:43] are reserved; [63:48] are daughterboard
-     * peripherals. Note that some documentation numbers
-     * external interrupts starting from 32 (because the
-     * A9MP has internal interrupts 0..31).
-     */
-    for (n = 0; n < 64; n++) {
-        pic[n] = qdev_get_gpio_in(dev, n);
-    }
+    init_cpus(cpu_model, "a9mpcore_priv", 0x1e000000, pic);
 
     /* Daughterboard peripherals : 0x10020000 .. 0x20000000 */
 
@@ -296,29 +326,14 @@ static void a15_daughterboard_init(const VEDBoardInfo *daughterboard,
                                    const char *cpu_model,
                                    qemu_irq *pic)
 {
-    int n;
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram = g_new(MemoryRegion, 1);
     MemoryRegion *sram = g_new(MemoryRegion, 1);
-    qemu_irq cpu_irq[4];
-    DeviceState *dev;
-    SysBusDevice *busdev;
 
     if (!cpu_model) {
         cpu_model = "cortex-a15";
     }
 
-    for (n = 0; n < smp_cpus; n++) {
-        ARMCPU *cpu;
-
-        cpu = cpu_arm_init(cpu_model);
-        if (!cpu) {
-            fprintf(stderr, "Unable to find CPU definition\n");
-            exit(1);
-        }
-        cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ);
-    }
-
     {
         /* We have to use a separate 64 bit variable here to avoid the gcc
          * "comparison is always false due to limited range of data type"
@@ -337,23 +352,7 @@ static void a15_daughterboard_init(const VEDBoardInfo *daughterboard,
     memory_region_add_subregion(sysmem, 0x80000000, ram);
 
     /* 0x2c000000 A15MPCore private memory region (GIC) */
-    dev = qdev_create(NULL, "a15mpcore_priv");
-    qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
-    qdev_init_nofail(dev);
-    busdev = SYS_BUS_DEVICE(dev);
-    sysbus_mmio_map(busdev, 0, 0x2c000000);
-    for (n = 0; n < smp_cpus; n++) {
-        sysbus_connect_irq(busdev, n, cpu_irq[n]);
-    }
-    /* Interrupts [42:0] are from the motherboard;
-     * [47:43] are reserved; [63:48] are daughterboard
-     * peripherals. Note that some documentation numbers
-     * external interrupts starting from 32 (because there
-     * are internal interrupts 0..31).
-     */
-    for (n = 0; n < 64; n++) {
-        pic[n] = qdev_get_gpio_in(dev, n);
-    }
+    init_cpus(cpu_model, "a15mpcore_priv", 0x2c000000, pic);
 
     /* A15 daughterboard peripherals: */
 

From b5a3ca3e3028ab86131920b45a19d553f278bdb4 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:45 +0000
Subject: [PATCH 02/30] realview-pbx-a9: Set reset-cbar property for CPUs

If the CPU is a Cortex-A9 then we should set its reset-cbar property
so that the guest can read the correct PERIPHBASE/CBAR register value;
newer versions of the Linux kernel (as of commit bc41b8724 in 3.12)
will otherwise assume the CPU is a buggy single core A9 SoC. The
realview-pbx-a9 is the only one of the cluster of boards in realview.c
which works with the Cortex-A9 (ie which gets an a9mpcore_priv device);
make sure it also has reset-cbar set correctly.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
Message-id: 1394462692-8871-3-git-send-email-peter.maydell@linaro.org
---
 hw/arm/realview.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/hw/arm/realview.c b/hw/arm/realview.c
index 6ef7646002..7e04e507f9 100644
--- a/hw/arm/realview.c
+++ b/hw/arm/realview.c
@@ -18,6 +18,7 @@
 #include "hw/i2c/i2c.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
+#include "qemu/error-report.h"
 
 #define SMP_BOOT_ADDR 0xe0000000
 #define SMP_BOOTREG_ADDR 0x10000030
@@ -49,6 +50,7 @@ static void realview_init(QEMUMachineInitArgs *args,
 {
     ARMCPU *cpu = NULL;
     CPUARMState *env;
+    ObjectClass *cpu_oc;
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram_lo = g_new(MemoryRegion, 1);
     MemoryRegion *ram_hi = g_new(MemoryRegion, 1);
@@ -70,12 +72,14 @@ static void realview_init(QEMUMachineInitArgs *args,
     uint32_t sys_id;
     ram_addr_t low_ram_size;
     ram_addr_t ram_size = args->ram_size;
+    hwaddr periphbase = 0;
 
     switch (board_type) {
     case BOARD_EB:
         break;
     case BOARD_EB_MPCORE:
         is_mpcore = 1;
+        periphbase = 0x10100000;
         break;
     case BOARD_PB_A8:
         is_pb = 1;
@@ -83,16 +87,37 @@ static void realview_init(QEMUMachineInitArgs *args,
     case BOARD_PBX_A9:
         is_mpcore = 1;
         is_pb = 1;
+        periphbase = 0x1f000000;
         break;
     }
+
+    cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, args->cpu_model);
+    if (!cpu_oc) {
+        fprintf(stderr, "Unable to find CPU definition\n");
+        exit(1);
+    }
+
     for (n = 0; n < smp_cpus; n++) {
-        cpu = cpu_arm_init(args->cpu_model);
-        if (!cpu) {
-            fprintf(stderr, "Unable to find CPU definition\n");
+        Object *cpuobj = object_new(object_class_get_name(cpu_oc));
+        Error *err = NULL;
+
+        if (is_pb && is_mpcore) {
+            object_property_set_int(cpuobj, periphbase, "reset-cbar", &err);
+            if (err) {
+                error_report("%s", error_get_pretty(err));
+                exit(1);
+            }
+        }
+
+        object_property_set_bool(cpuobj, true, "realized", &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
             exit(1);
         }
-        cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ);
+
+        cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpuobj), ARM_CPU_IRQ);
     }
+    cpu = ARM_CPU(first_cpu);
     env = &cpu->env;
     if (arm_feature(env, ARM_FEATURE_V7)) {
         if (is_mpcore) {
@@ -141,16 +166,10 @@ static void realview_init(QEMUMachineInitArgs *args,
     sysbus_mmio_map(SYS_BUS_DEVICE(sysctl), 0, 0x10000000);
 
     if (is_mpcore) {
-        hwaddr periphbase;
         dev = qdev_create(NULL, is_pb ? "a9mpcore_priv": "realview_mpcore");
         qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
         qdev_init_nofail(dev);
         busdev = SYS_BUS_DEVICE(dev);
-        if (is_pb) {
-            periphbase = 0x1f000000;
-        } else {
-            periphbase = 0x10100000;
-        }
         sysbus_mmio_map(busdev, 0, periphbase);
         for (n = 0; n < smp_cpus; n++) {
             sysbus_connect_irq(busdev, n, cpu_irq[n]);

From 4719ab918a837fb12f34599139f2c7c0137ca703 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:46 +0000
Subject: [PATCH 03/30] exynos4210: Set reset-cbar property of Cortex-A9 CPUs

Set the reset-cbar property of the Exynos4210 SoC's Cortex-A9
CPUs, so that Linux doesn't misrecognize them as a broken
uniprocessor SoC.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
Message-id: 1394462692-8871-4-git-send-email-peter.maydell@linaro.org
---
 hw/arm/exynos4210.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c
index 9f137e9acd..6426d168d2 100644
--- a/hw/arm/exynos4210.c
+++ b/hw/arm/exynos4210.c
@@ -143,11 +143,21 @@ Exynos4210State *exynos4210_init(MemoryRegion *system_mem,
     unsigned long mem_size;
     DeviceState *dev;
     SysBusDevice *busdev;
+    ObjectClass *cpu_oc;
+
+    cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, "cortex-a9");
+    assert(cpu_oc);
 
     for (n = 0; n < EXYNOS4210_NCPUS; n++) {
-        s->cpu[n] = cpu_arm_init("cortex-a9");
-        if (!s->cpu[n]) {
-            fprintf(stderr, "Unable to find CPU %d definition\n", n);
+        Object *cpuobj = object_new(object_class_get_name(cpu_oc));
+        Error *err = NULL;
+
+        s->cpu[n] = ARM_CPU(cpuobj);
+        object_property_set_int(cpuobj, EXYNOS4210_SMP_PRIVATE_BASE_ADDR,
+                                "reset-cbar", &error_abort);
+        object_property_set_bool(cpuobj, true, "realized", &err);
+        if (err) {
+            error_report("%s", error_get_pretty(err));
             exit(1);
         }
     }

From ba7500852d8c3926a732892236765eee1bcaea93 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:46 +0000
Subject: [PATCH 04/30] virt: Set reset-cbar on CPUs

Set the reset-cbar property on CPUs used by the virt board,
if they have it. This isn't necessary for correct functioning
under Linux (since the A9 isn't a valid CPU for the virt board),
but it is the correct behaviour.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
Message-id: 1394462692-8871-5-git-send-email-peter.maydell@linaro.org
---
 hw/arm/virt.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 517f2fe30f..2bbc9313d2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -390,6 +390,12 @@ static void machvirt_init(QEMUMachineInitArgs *args)
         if (n > 0) {
             object_property_set_bool(cpuobj, true, "start-powered-off", NULL);
         }
+
+        if (object_property_find(cpuobj, "reset-cbar", NULL)) {
+            object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base,
+                                    "reset-cbar", &error_abort);
+        }
+
         object_property_set_bool(cpuobj, true, "realized", NULL);
     }
     fdt_add_cpu_nodes(vbi);

From d6d60581f3f6778de85ee23427006151b5226667 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:46 +0000
Subject: [PATCH 05/30] target-arm: Add ARM_CP_IO notation to PMCR reginfo

Now that the PMCR writefn makes timer accesses, its reginfo needs
the ARM_CP_IO flag, so that icount mode works correctly. (Fixes
the bug accidentally introduced in commit 7c2cb42b).

Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1394908291-16546-1-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index f0a1fd48e6..5080372bad 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1983,6 +1983,7 @@ void register_cp_regs_for_features(ARMCPU *cpu)
         ARMCPRegInfo pmcr = {
             .name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0,
             .access = PL0_RW, .resetvalue = cpu->midr & 0xff000000,
+            .type = ARM_CP_IO,
             .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcr),
             .accessfn = pmreg_access, .writefn = pmcr_write,
             .raw_writefn = raw_write,

From a984e42c916ad5afdf3f8660f284857547943aa4 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:47 +0000
Subject: [PATCH 06/30] target-arm: A64: Implement PMULL instruction

Implement the PMULL instruction; this is the last unimplemented insn
in the three-reg-diff group.

Note that PMULL with size 3 is considered part of the AES part
of the crypto extensions (see the ID_AA64ISAR0_EL1 register definition
in the v8 ARM ARM), so it isn't necessary to burn an extra feature
bit on it, even though we're using more feature bits than a single
"crypto extension present/not present" toggle.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-2-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper-a64.c    | 30 ++++++++++++++++++++++++++++
 target-arm/helper-a64.h    |  2 ++
 target-arm/translate-a64.c | 41 ++++++++++++++++++++++++++++++++++++--
 target-arm/translate.c     |  1 +
 target-arm/translate.h     |  6 ++++++
 5 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index c2ce33ee88..80ed029053 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -180,6 +180,36 @@ uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices,
     return result;
 }
 
+/* Helper function for 64 bit polynomial multiply case:
+ * perform PolynomialMult(op1, op2) and return either the top or
+ * bottom half of the 128 bit result.
+ */
+uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
+{
+    int bitnum;
+    uint64_t res = 0;
+
+    for (bitnum = 0; bitnum < 64; bitnum++) {
+        if (op1 & (1ULL << bitnum)) {
+            res ^= op2 << bitnum;
+        }
+    }
+    return res;
+}
+uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
+{
+    int bitnum;
+    uint64_t res = 0;
+
+    /* bit 0 of op1 can't influence the high 64 bits at all */
+    for (bitnum = 1; bitnum < 64; bitnum++) {
+        if (op1 & (1ULL << bitnum)) {
+            res ^= op2 >> (64 - bitnum);
+        }
+    }
+    return res;
+}
+
 /* 64bit/double versions of the neon float compare functions */
 uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
 {
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index ab9933cab0..43d8bbfc73 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -27,6 +27,8 @@ DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
 DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
 DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
+DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 2fd9113628..37252b7479 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7124,6 +7124,10 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
                 gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
                                                   tcg_passres, tcg_passres);
                 break;
+            case 14: /* PMULL */
+                assert(size == 0);
+                gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
+                break;
             default:
                 g_assert_not_reached();
             }
@@ -7243,6 +7247,30 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,
     }
 }
 
+static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm)
+{
+    /* PMULL of 64 x 64 -> 128 is an odd special case because it
+     * is the only three-reg-diff instruction which produces a
+     * 128-bit wide result from a single operation. However since
+     * it's possible to calculate the two halves more or less
+     * separately we just use two helper calls.
+     */
+    TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+    TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+    TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+    read_vec_element(s, tcg_op1, rn, is_q, MO_64);
+    read_vec_element(s, tcg_op2, rm, is_q, MO_64);
+    gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2);
+    write_vec_element(s, tcg_res, rd, 0, MO_64);
+    gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2);
+    write_vec_element(s, tcg_res, rd, 1, MO_64);
+
+    tcg_temp_free_i64(tcg_op1);
+    tcg_temp_free_i64(tcg_op2);
+    tcg_temp_free_i64(tcg_res);
+}
+
 /* C3.6.15 AdvSIMD three different
  *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+---+------+--------+-----+------+------+
@@ -7293,8 +7321,15 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
-        unsupported_encoding(s, insn);
-        break;
+        if (size == 3) {
+            if (!arm_dc_feature(s, ARM_FEATURE_V8_AES)) {
+                unallocated_encoding(s);
+                return;
+            }
+            handle_pmull_64(s, is_q, rd, rn, rm);
+            return;
+        }
+        goto is_widening;
     case 9: /* SQDMLAL, SQDMLAL2 */
     case 11: /* SQDMLSL, SQDMLSL2 */
     case 13: /* SQDMULL, SQDMULL2 */
@@ -7315,6 +7350,7 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
+    is_widening:
         handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm);
         break;
     default:
@@ -9045,6 +9081,7 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu,
     dc->vec_stride = 0;
     dc->cp_regs = cpu->cp_regs;
     dc->current_pl = arm_current_pl(env);
+    dc->features = env->features;
 
     init_tmp_a64_array(dc);
 
diff --git a/target-arm/translate.c b/target-arm/translate.c
index fbe513b40d..20042976f2 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -10654,6 +10654,7 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     dc->vec_stride = ARM_TBFLAG_VECSTRIDE(tb->flags);
     dc->cp_regs = cpu->cp_regs;
     dc->current_pl = arm_current_pl(env);
+    dc->features = env->features;
 
     cpu_F0s = tcg_temp_new_i32();
     cpu_F1s = tcg_temp_new_i32();
diff --git a/target-arm/translate.h b/target-arm/translate.h
index 2f491f9ff6..3525ffcecb 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -26,6 +26,7 @@ typedef struct DisasContext {
     int aarch64;
     int current_pl;
     GHashTable *cp_regs;
+    uint64_t features; /* CPU features bits */
 #define TMP_A64_MAX 16
     int tmp_a64_count;
     TCGv_i64 tmp_a64[TMP_A64_MAX];
@@ -33,6 +34,11 @@ typedef struct DisasContext {
 
 extern TCGv_ptr cpu_env;
 
+static inline int arm_dc_feature(DisasContext *dc, int feature)
+{
+    return (dc->features & (1ULL << feature)) != 0;
+}
+
 /* target-specific extra values for is_jmp */
 /* These instructions trap after executing, so the A32/T32 decoder must
  * defer them until after the conditional execution state has been updated.

From cf4ab1af296b8ef5d5a1dc65fda804b88ddd0553 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:47 +0000
Subject: [PATCH 07/30] target-arm: A64: Fix bug in add_sub_ext handling of rn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rn == 31 always means SP (not XZR) whether an add_sub_ext
instruction is setting the flags or not; only rd has behaviour
dependent on whether we are setting flags.

Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-3-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 37252b7479..444dd8504d 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -3096,12 +3096,11 @@ static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn)
 
     /* non-flag setting ops may use SP */
     if (!setflags) {
-        tcg_rn = read_cpu_reg_sp(s, rn, sf);
         tcg_rd = cpu_reg_sp(s, rd);
     } else {
-        tcg_rn = read_cpu_reg(s, rn, sf);
         tcg_rd = cpu_reg(s, rd);
     }
+    tcg_rn = read_cpu_reg_sp(s, rn, sf);
 
     tcg_rm = read_cpu_reg(s, rm, sf);
     ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3);

From 10113b6903c0256c1741918430b0304c5a60b7a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:47 +0000
Subject: [PATCH 08/30] target-arm: A64: Add last AdvSIMD Integer to FP ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds the remaining [US]CVTF operations to the SIMD
shift-immediate, scalar-shift-immediate, two-reg-misc and
scalar-two-reg-misc groups of opcodes.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1394822294-14837-4-git-send-email-peter.maydell@linaro.org
[PMM: added scalar 2-misc and scalar-shift-imm encodings]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/translate-a64.c | 132 ++++++++++++++++++++++++++++++++++---
 1 file changed, 123 insertions(+), 9 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 444dd8504d..550decc800 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5907,6 +5907,95 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert,
     tcg_temp_free_i64(tcg_rd);
 }
 
+/* Common vector code for handling integer to FP conversion */
+static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
+                                   int elements, int is_signed,
+                                   int fracbits, int size)
+{
+    bool is_double = size == 3 ? true : false;
+    TCGv_ptr tcg_fpst = get_fpstatus_ptr();
+    TCGv_i32 tcg_shift = tcg_const_i32(fracbits);
+    TCGv_i64 tcg_int = tcg_temp_new_i64();
+    TCGMemOp mop = size | (is_signed ? MO_SIGN : 0);
+    int pass;
+
+    for (pass = 0; pass < elements; pass++) {
+        read_vec_element(s, tcg_int, rn, pass, mop);
+
+        if (is_double) {
+            TCGv_i64 tcg_double = tcg_temp_new_i64();
+            if (is_signed) {
+                gen_helper_vfp_sqtod(tcg_double, tcg_int,
+                                     tcg_shift, tcg_fpst);
+            } else {
+                gen_helper_vfp_uqtod(tcg_double, tcg_int,
+                                     tcg_shift, tcg_fpst);
+            }
+            if (elements == 1) {
+                write_fp_dreg(s, rd, tcg_double);
+            } else {
+                write_vec_element(s, tcg_double, rd, pass, MO_64);
+            }
+            tcg_temp_free_i64(tcg_double);
+        } else {
+            TCGv_i32 tcg_single = tcg_temp_new_i32();
+            if (is_signed) {
+                gen_helper_vfp_sqtos(tcg_single, tcg_int,
+                                     tcg_shift, tcg_fpst);
+            } else {
+                gen_helper_vfp_uqtos(tcg_single, tcg_int,
+                                     tcg_shift, tcg_fpst);
+            }
+            if (elements == 1) {
+                write_fp_sreg(s, rd, tcg_single);
+            } else {
+                write_vec_element_i32(s, tcg_single, rd, pass, MO_32);
+            }
+            tcg_temp_free_i32(tcg_single);
+        }
+    }
+
+    if (!is_double && elements == 2) {
+        clear_vec_high(s, rd);
+    }
+
+    tcg_temp_free_i64(tcg_int);
+    tcg_temp_free_ptr(tcg_fpst);
+    tcg_temp_free_i32(tcg_shift);
+}
+
+/* UCVTF/SCVTF - Integer to FP conversion */
+static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar,
+                                         bool is_q, bool is_u,
+                                         int immh, int immb, int opcode,
+                                         int rn, int rd)
+{
+    bool is_double = extract32(immh, 3, 1);
+    int size = is_double ? MO_64 : MO_32;
+    int elements;
+    int immhb = immh << 3 | immb;
+    int fracbits = (is_double ? 128 : 64) - immhb;
+
+    if (!extract32(immh, 2, 2)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (is_scalar) {
+        elements = 1;
+    } else {
+        elements = is_double ? 2 : is_q ? 4 : 2;
+        if (is_double && !is_q) {
+            unallocated_encoding(s);
+            return;
+        }
+    }
+    /* immh == 0 would be a failure of the decode logic */
+    g_assert(immh);
+
+    handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size);
+}
+
 /* C3.6.9 AdvSIMD scalar shift by immediate
  *  31 30  29 28         23 22  19 18  16 15    11  10 9    5 4    0
  * +-----+---+-------------+------+------+--------+---+------+------+
@@ -5934,6 +6023,10 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
     case 0x0a: /* SHL / SLI */
         handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
         break;
+    case 0x1c: /* SCVTF, UCVTF */
+        handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb,
+                                     opcode, rn, rd);
+        break;
     default:
         unsupported_encoding(s, insn);
         break;
@@ -6689,10 +6782,16 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x6d: /* FCMLE (zero) */
             handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd);
             return;
+        case 0x1d: /* SCVTF */
+        case 0x5d: /* UCVTF */
+        {
+            bool is_signed = (opcode == 0x1d);
+            handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size);
+            return;
+        }
         case 0x1a: /* FCVTNS */
         case 0x1b: /* FCVTMS */
         case 0x1c: /* FCVTAS */
-        case 0x1d: /* SCVTF */
         case 0x3a: /* FCVTPS */
         case 0x3b: /* FCVTZS */
         case 0x3d: /* FRECPE */
@@ -6701,7 +6800,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x5a: /* FCVTNU */
         case 0x5b: /* FCVTMU */
         case 0x5c: /* FCVTAU */
-        case 0x5d: /* UCVTF */
         case 0x7a: /* FCVTPU */
         case 0x7b: /* FCVTZU */
         case 0x7d: /* FRSQRTE */
@@ -6877,7 +6975,6 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
     }
 }
 
-
 /* C3.6.14 AdvSIMD shift by immediate
  *  31  30   29 28         23 22  19 18  16 15    11  10 9    5 4    0
  * +---+---+---+-------------+------+------+--------+---+------+------+
@@ -6907,10 +7004,16 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
     case 0x14: /* SSHLL / USHLL */
         handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
         break;
+    case 0x1c: /* SCVTF / UCVTF */
+        handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
+                                     opcode, rn, rd);
+        break;
+    case 0x1f: /* FCVTZS/ FCVTZU */
+        unsupported_encoding(s, insn);
+        return;
     default:
-        /* We don't currently implement any of the Narrow or saturating shifts;
-         * nor do we implement the fixed-point conversions in this
-         * encoding group (SCVTF, FCVTZS, UCVTF, FCVTZU).
+        /* We don't currently implement any of the Narrow or
+         * saturating shifts.
          */
         unsupported_encoding(s, insn);
         return;
@@ -8255,8 +8358,9 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         /* Floating point: U, size[1] and opcode indicate operation;
          * size[0] indicates single or double precision.
          */
+        int is_double = extract32(size, 0, 1);
         opcode |= (extract32(size, 1, 1) << 5) | (u << 6);
-        size = extract32(size, 0, 1) ? 3 : 2;
+        size = is_double ? 3 : 2;
         switch (opcode) {
         case 0x2f: /* FABS */
         case 0x6f: /* FNEG */
@@ -8265,6 +8369,18 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
+        case 0x1d: /* SCVTF */
+        case 0x5d: /* UCVTF */
+        {
+            bool is_signed = (opcode == 0x1d) ? true : false;
+            int elements = is_double ? 2 : is_q ? 4 : 2;
+            if (is_double && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size);
+            return;
+        }
         case 0x2c: /* FCMGT (zero) */
         case 0x2d: /* FCMEQ (zero) */
         case 0x2e: /* FCMLT (zero) */
@@ -8283,7 +8399,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x1a: /* FCVTNS */
         case 0x1b: /* FCVTMS */
         case 0x1c: /* FCVTAS */
-        case 0x1d: /* SCVTF */
         case 0x38: /* FRINTP */
         case 0x39: /* FRINTZ */
         case 0x3a: /* FCVTPS */
@@ -8296,7 +8411,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x5a: /* FCVTNU */
         case 0x5b: /* FCVTMU */
         case 0x5c: /* FCVTAU */
-        case 0x5d: /* UCVTF */
         case 0x79: /* FRINTI */
         case 0x7a: /* FCVTPU */
         case 0x7b: /* FCVTZU */

From f612537e0706761d5692deaa72516695ef0a2ac8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:47 +0000
Subject: [PATCH 09/30] target-arm: A64: Add FSQRT to C3.6.17 (two misc)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement FSQRT in the two-reg-misc category.
GCC uses this instruction form.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-5-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 550decc800..427f4847fa 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6623,6 +6623,9 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
     case 0x6f: /* FNEG */
         gen_helper_vfp_negd(tcg_rd, tcg_rn);
         break;
+    case 0x7f: /* FSQRT */
+        gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -8392,6 +8395,12 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             }
             handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd);
             return;
+        case 0x7f: /* FSQRT */
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            break;
         case 0x16: /* FCVTN, FCVTN2 */
         case 0x17: /* FCVTL, FCVTL2 */
         case 0x18: /* FRINTN */
@@ -8416,7 +8425,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x7b: /* FCVTZU */
         case 0x7c: /* URSQRTE */
         case 0x7d: /* FRSQRTE */
-        case 0x7f: /* FSQRT */
             unsupported_encoding(s, insn);
             return;
         default:
@@ -8493,6 +8501,9 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 case 0x6f: /* FNEG */
                     gen_helper_vfp_negs(tcg_res, tcg_op);
                     break;
+                case 0x7f: /* FSQRT */
+                    gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env);
+                    break;
                 default:
                     g_assert_not_reached();
                 }

From b05c3068577f6caea6f1911b9e03d52dbf84f475 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:48 +0000
Subject: [PATCH 10/30] target-arm: A64: Add remaining CLS/Z vector ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the CLS, CLZ operations in the 2-reg-misc category.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-6-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper-a64.c    |  5 +++++
 target-arm/helper-a64.h    |  1 +
 target-arm/translate-a64.c | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 80ed029053..8f53223cee 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -60,6 +60,11 @@ uint32_t HELPER(cls32)(uint32_t x)
     return clrsb32(x);
 }
 
+uint32_t HELPER(clz32)(uint32_t x)
+{
+    return clz32(x);
+}
+
 uint64_t HELPER(rbit64)(uint64_t x)
 {
     /* assign the correct byte position */
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 43d8bbfc73..a113d22584 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -21,6 +21,7 @@ DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
 DEF_HELPER_FLAGS_1(clz64, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(cls64, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(cls32, TCG_CALL_NO_RWG_SE, i32, i32)
+DEF_HELPER_FLAGS_1(clz32, TCG_CALL_NO_RWG_SE, i32, i32)
 DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 427f4847fa..4d40fb0615 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6584,6 +6584,13 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
     TCGCond cond;
 
     switch (opcode) {
+    case 0x4: /* CLS, CLZ */
+        if (u) {
+            gen_helper_clz64(tcg_rd, tcg_rn);
+        } else {
+            gen_helper_cls64(tcg_rd, tcg_rn);
+        }
+        break;
     case 0x5: /* NOT */
         /* This opcode is shared with CNT and RBIT but we have earlier
          * enforced that size == 3 if and only if this is the NOT insn.
@@ -8316,8 +8323,13 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         }
         handle_2misc_narrow(s, opcode, u, is_q, size, rn, rd);
         return;
-    case 0x2: /* SADDLP, UADDLP */
     case 0x4: /* CLS, CLZ */
+        if (size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0x2: /* SADDLP, UADDLP */
     case 0x6: /* SADALP, UADALP */
         if (size == 3) {
             unallocated_encoding(s);
@@ -8484,6 +8496,13 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 case 0x9: /* CMEQ, CMLE */
                     cond = u ? TCG_COND_LE : TCG_COND_EQ;
                     goto do_cmop;
+                case 0x4: /* CLS */
+                    if (u) {
+                        gen_helper_clz32(tcg_res, tcg_op);
+                    } else {
+                        gen_helper_cls32(tcg_res, tcg_op);
+                    }
+                    break;
                 case 0xb: /* ABS, NEG */
                     if (u) {
                         tcg_gen_neg_i32(tcg_res, tcg_op);
@@ -8567,6 +8586,21 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                         }
                     }
                     break;
+                case 0x4: /* CLS, CLZ */
+                    if (u) {
+                        if (size == 0) {
+                            gen_helper_neon_clz_u8(tcg_res, tcg_op);
+                        } else {
+                            gen_helper_neon_clz_u16(tcg_res, tcg_op);
+                        }
+                    } else {
+                        if (size == 0) {
+                            gen_helper_neon_cls_s8(tcg_res, tcg_op);
+                        } else {
+                            gen_helper_neon_cls_s16(tcg_res, tcg_op);
+                        }
+                    }
+                    break;
                 default:
                     g_assert_not_reached();
                 }

From c1b876b2e9a47e3e36be57a4f3d167b19c5f586a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:48 +0000
Subject: [PATCH 11/30] target-arm: A64: Saturating and narrowing shift ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements the remaining [US][Q][R]SHR[U][N][2] opcodes, which are
saturating and narrowing shift right operations. These are used in
things like libav. Note signed shifts can have an "unsigned" saturating
narrow operation which will floor negative values.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 1394822294-14837-7-git-send-email-peter.maydell@linaro.org
[PMM: Added the scalar encodings, style tweaks]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/translate-a64.c | 181 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 3 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 4d40fb0615..f8cae69e01 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5907,6 +5907,94 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert,
     tcg_temp_free_i64(tcg_rd);
 }
 
+/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with
+ * (signed/unsigned) narrowing */
+static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
+                                   bool is_u_shift, bool is_u_narrow,
+                                   int immh, int immb, int opcode,
+                                   int rn, int rd)
+{
+    int immhb = immh << 3 | immb;
+    int size = 32 - clz32(immh) - 1;
+    int esize = 8 << size;
+    int shift = (2 * esize) - immhb;
+    int elements = is_scalar ? 1 : (64 / esize);
+    bool round = extract32(opcode, 0, 1);
+    TCGMemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN);
+    TCGv_i64 tcg_rn, tcg_rd, tcg_round;
+    TCGv_i32 tcg_rd_narrowed;
+    TCGv_i64 tcg_final;
+
+    static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = {
+        { gen_helper_neon_narrow_sat_s8,
+          gen_helper_neon_unarrow_sat8 },
+        { gen_helper_neon_narrow_sat_s16,
+          gen_helper_neon_unarrow_sat16 },
+        { gen_helper_neon_narrow_sat_s32,
+          gen_helper_neon_unarrow_sat32 },
+        { NULL, NULL },
+    };
+    static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = {
+        gen_helper_neon_narrow_sat_u8,
+        gen_helper_neon_narrow_sat_u16,
+        gen_helper_neon_narrow_sat_u32,
+        NULL
+    };
+    NeonGenNarrowEnvFn *narrowfn;
+
+    int i;
+
+    assert(size < 4);
+
+    if (extract32(immh, 3, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (is_u_shift) {
+        narrowfn = unsigned_narrow_fns[size];
+    } else {
+        narrowfn = signed_narrow_fns[size][is_u_narrow ? 1 : 0];
+    }
+
+    tcg_rn = tcg_temp_new_i64();
+    tcg_rd = tcg_temp_new_i64();
+    tcg_rd_narrowed = tcg_temp_new_i32();
+    tcg_final = tcg_const_i64(0);
+
+    if (round) {
+        uint64_t round_const = 1ULL << (shift - 1);
+        tcg_round = tcg_const_i64(round_const);
+    } else {
+        TCGV_UNUSED_I64(tcg_round);
+    }
+
+    for (i = 0; i < elements; i++) {
+        read_vec_element(s, tcg_rn, rn, i, ldop);
+        handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+                                false, is_u_shift, size+1, shift);
+        narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd);
+        tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed);
+        tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+        write_vec_element(s, tcg_final, rd, 0, MO_64);
+    } else {
+        write_vec_element(s, tcg_final, rd, 1, MO_64);
+    }
+
+    if (round) {
+        tcg_temp_free_i64(tcg_round);
+    }
+    tcg_temp_free_i64(tcg_rn);
+    tcg_temp_free_i64(tcg_rd);
+    tcg_temp_free_i32(tcg_rd_narrowed);
+    tcg_temp_free_i64(tcg_final);
+    return;
+}
+
 /* Common vector code for handling integer to FP conversion */
 static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
                                    int elements, int is_signed,
@@ -6013,6 +6101,11 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
     int immh = extract32(insn, 19, 4);
     bool is_u = extract32(insn, 29, 1);
 
+    if (immh == 0) {
+        unallocated_encoding(s);
+        return;
+    }
+
     switch (opcode) {
     case 0x00: /* SSHR / USHR */
     case 0x02: /* SSRA / USRA */
@@ -6027,6 +6120,20 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
         handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb,
                                      opcode, rn, rd);
         break;
+    case 0x10: /* SQSHRUN, SQSHRUN2 */
+    case 0x11: /* SQRSHRUN, SQRSHRUN2 */
+        if (!is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_vec_simd_sqshrn(s, true, false, false, true,
+                               immh, immb, opcode, rn, rd);
+        break;
+    case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */
+    case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */
+        handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
+                               immh, immb, opcode, rn, rd);
+        break;
     default:
         unsupported_encoding(s, insn);
         break;
@@ -6985,6 +7092,63 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
     }
 }
 
+/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */
+static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
+                                 int immh, int immb, int opcode, int rn, int rd)
+{
+    int immhb = immh << 3 | immb;
+    int size = 32 - clz32(immh) - 1;
+    int dsize = 64;
+    int esize = 8 << size;
+    int elements = dsize/esize;
+    int shift = (2 * esize) - immhb;
+    bool round = extract32(opcode, 0, 1);
+    TCGv_i64 tcg_rn, tcg_rd, tcg_final;
+    TCGv_i64 tcg_round;
+    int i;
+
+    if (extract32(immh, 3, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_rn = tcg_temp_new_i64();
+    tcg_rd = tcg_temp_new_i64();
+    tcg_final = tcg_temp_new_i64();
+    read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64);
+
+    if (round) {
+        uint64_t round_const = 1ULL << (shift - 1);
+        tcg_round = tcg_const_i64(round_const);
+    } else {
+        TCGV_UNUSED_I64(tcg_round);
+    }
+
+    for (i = 0; i < elements; i++) {
+        read_vec_element(s, tcg_rn, rn, i, size+1);
+        handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+                                false, true, size+1, shift);
+
+        tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+        write_vec_element(s, tcg_final, rd, 0, MO_64);
+    } else {
+        write_vec_element(s, tcg_final, rd, 1, MO_64);
+    }
+
+    if (round) {
+        tcg_temp_free_i64(tcg_round);
+    }
+    tcg_temp_free_i64(tcg_rn);
+    tcg_temp_free_i64(tcg_rd);
+    tcg_temp_free_i64(tcg_final);
+    return;
+}
+
+
 /* C3.6.14 AdvSIMD shift by immediate
  *  31  30   29 28         23 22  19 18  16 15    11  10 9    5 4    0
  * +---+---+---+-------------+------+------+--------+---+------+------+
@@ -7011,6 +7175,20 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
     case 0x0a: /* SHL / SLI */
         handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
         break;
+    case 0x10: /* SHRN */
+    case 0x11: /* RSHRN / SQRSHRUN */
+        if (is_u) {
+            handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb,
+                                   opcode, rn, rd);
+        } else {
+            handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd);
+        }
+        break;
+    case 0x12: /* SQSHRN / UQSHRN */
+    case 0x13: /* SQRSHRN / UQRSHRN */
+        handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb,
+                               opcode, rn, rd);
+        break;
     case 0x14: /* SSHLL / USHLL */
         handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
         break;
@@ -7022,9 +7200,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
         unsupported_encoding(s, insn);
         return;
     default:
-        /* We don't currently implement any of the Narrow or
-         * saturating shifts.
-         */
         unsupported_encoding(s, insn);
         return;
     }

From 6781fa119f3f403bcab59142faa9581aff974358 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:48 +0000
Subject: [PATCH 12/30] target-arm: A64: Implement SADDLP, UADDLP, SADALP,
 UADALP

Implement the SADDLP, UADDLP, SADALP and UADALP instructions
in the SIMD 2-reg misc category.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-8-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper-a64.c    | 61 +++++++++++++++++++++++++++++++
 target-arm/helper-a64.h    |  4 ++
 target-arm/translate-a64.c | 75 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 8f53223cee..c31c45e729 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -293,3 +293,64 @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
     }
     return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
 }
+
+/* Pairwise long add: add pairs of adjacent elements into
+ * double-width elements in the result (eg _s8 is an 8x8->16 op)
+ */
+uint64_t HELPER(neon_addlp_s8)(uint64_t a)
+{
+    uint64_t nsignmask = 0x0080008000800080ULL;
+    uint64_t wsignmask = 0x8000800080008000ULL;
+    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
+    uint64_t tmp1, tmp2;
+    uint64_t res, signres;
+
+    /* Extract odd elements, sign extend each to a 16 bit field */
+    tmp1 = a & elementmask;
+    tmp1 ^= nsignmask;
+    tmp1 |= wsignmask;
+    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
+    /* Ditto for the even elements */
+    tmp2 = (a >> 8) & elementmask;
+    tmp2 ^= nsignmask;
+    tmp2 |= wsignmask;
+    tmp2 = (tmp2 - nsignmask) ^ wsignmask;
+
+    /* calculate the result by summing bits 0..14, 16..22, etc,
+     * and then adjusting the sign bits 15, 23, etc manually.
+     * This ensures the addition can't overflow the 16 bit field.
+     */
+    signres = (tmp1 ^ tmp2) & wsignmask;
+    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
+    res ^= signres;
+
+    return res;
+}
+
+uint64_t HELPER(neon_addlp_u8)(uint64_t a)
+{
+    uint64_t tmp;
+
+    tmp = a & 0x00ff00ff00ff00ffULL;
+    tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
+    return tmp;
+}
+
+uint64_t HELPER(neon_addlp_s16)(uint64_t a)
+{
+    int32_t reslo, reshi;
+
+    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
+    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
+
+    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
+}
+
+uint64_t HELPER(neon_addlp_u16)(uint64_t a)
+{
+    uint64_t tmp;
+
+    tmp = a & 0x0000ffff0000ffffULL;
+    tmp += (a >> 16) & 0x0000ffff0000ffffULL;
+    return tmp;
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index a113d22584..88fc9fe9e7 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -39,3 +39,7 @@ DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
+DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f8cae69e01..4562facba2 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -81,6 +81,7 @@ typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
 typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
 typedef void NeonGenTwoSingleOPFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
+typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
 
 /* initialize TCG globals.  */
 void a64_translate_init(void)
@@ -8456,6 +8457,78 @@ static void handle_rev(DisasContext *s, int opcode, bool u,
     }
 }
 
+static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u,
+                                  bool is_q, int size, int rn, int rd)
+{
+    /* Implement the pairwise operations from 2-misc:
+     * SADDLP, UADDLP, SADALP, UADALP.
+     * These all add pairs of elements in the input to produce a
+     * double-width result element in the output (possibly accumulating).
+     */
+    bool accum = (opcode == 0x6);
+    int maxpass = is_q ? 2 : 1;
+    int pass;
+    TCGv_i64 tcg_res[2];
+
+    if (size == 2) {
+        /* 32 + 32 -> 64 op */
+        TCGMemOp memop = size + (u ? 0 : MO_SIGN);
+
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+            TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+
+            tcg_res[pass] = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op1, rn, pass * 2, memop);
+            read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop);
+            tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2);
+            if (accum) {
+                read_vec_element(s, tcg_op1, rd, pass, MO_64);
+                tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+            }
+
+            tcg_temp_free_i64(tcg_op1);
+            tcg_temp_free_i64(tcg_op2);
+        }
+    } else {
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i64 tcg_op = tcg_temp_new_i64();
+            NeonGenOneOpFn *genfn;
+            static NeonGenOneOpFn * const fns[2][2] = {
+                { gen_helper_neon_addlp_s8,  gen_helper_neon_addlp_u8 },
+                { gen_helper_neon_addlp_s16,  gen_helper_neon_addlp_u16 },
+            };
+
+            genfn = fns[size][u];
+
+            tcg_res[pass] = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+            genfn(tcg_res[pass], tcg_op);
+
+            if (accum) {
+                read_vec_element(s, tcg_op, rd, pass, MO_64);
+                if (size == 0) {
+                    gen_helper_neon_addl_u16(tcg_res[pass],
+                                             tcg_res[pass], tcg_op);
+                } else {
+                    gen_helper_neon_addl_u32(tcg_res[pass],
+                                             tcg_res[pass], tcg_op);
+                }
+            }
+            tcg_temp_free_i64(tcg_op);
+        }
+    }
+    if (!is_q) {
+        tcg_res[1] = tcg_const_i64(0);
+    }
+    for (pass = 0; pass < 2; pass++) {
+        write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+        tcg_temp_free_i64(tcg_res[pass]);
+    }
+}
+
 /* C3.6.17 AdvSIMD two reg misc
  *   31  30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+-----------+--------+-----+------+------+
@@ -8510,7 +8583,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
-        unsupported_encoding(s, insn);
+        handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd);
         return;
     case 0x13: /* SHLL, SHLL2 */
         if (u == 0 || size == 3) {

From 73a81d10fda3cb45e62efd6829f19debb9f54073 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:49 +0000
Subject: [PATCH 13/30] target-arm: A64: Implement SHLL, SHLL2

Implement the SHLL and SHLL2 instructions from the 2-reg-misc
category.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-9-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 4562facba2..7fca9ff5dc 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -8529,6 +8529,36 @@ static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u,
     }
 }
 
+static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd)
+{
+    /* Implement SHLL and SHLL2 */
+    int pass;
+    int part = is_q ? 2 : 0;
+    TCGv_i64 tcg_res[2];
+
+    for (pass = 0; pass < 2; pass++) {
+        static NeonGenWidenFn * const widenfns[3] = {
+            gen_helper_neon_widen_u8,
+            gen_helper_neon_widen_u16,
+            tcg_gen_extu_i32_i64,
+        };
+        NeonGenWidenFn *widenfn = widenfns[size];
+        TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+        read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32);
+        tcg_res[pass] = tcg_temp_new_i64();
+        widenfn(tcg_res[pass], tcg_op);
+        tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size);
+
+        tcg_temp_free_i32(tcg_op);
+    }
+
+    for (pass = 0; pass < 2; pass++) {
+        write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+        tcg_temp_free_i64(tcg_res[pass]);
+    }
+}
+
 /* C3.6.17 AdvSIMD two reg misc
  *   31  30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+-----------+--------+-----+------+------+
@@ -8590,7 +8620,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
-        unsupported_encoding(s, insn);
+        handle_shll(s, is_q, size, rn, rd);
         return;
     case 0xa: /* CMLT */
         if (u == 1) {

From 04c7c6c261c3000c851eb177a7d32236828f4be2 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:49 +0000
Subject: [PATCH 14/30] target-arm: A64: Implement FCVT[NMAPZ][SU] SIMD
 instructions

Implement the floating-point-to-integer conversion instructions
FCVT[NMAPZ][SU] in the 2-reg-misc and scalar-2-reg-misc
categories.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-10-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 190 +++++++++++++++++++++++++++++++++----
 1 file changed, 170 insertions(+), 20 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 7fca9ff5dc..ec77c8b301 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6683,11 +6683,14 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
 }
 
 static void handle_2misc_64(DisasContext *s, int opcode, bool u,
-                            TCGv_i64 tcg_rd, TCGv_i64 tcg_rn)
+                            TCGv_i64 tcg_rd, TCGv_i64 tcg_rn,
+                            TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus)
 {
     /* Handle 64->64 opcodes which are shared between the scalar and
      * vector 2-reg-misc groups. We cover every integer opcode where size == 3
      * is valid in either group and also the double-precision fp ops.
+     * The caller only need provide tcg_rmode and tcg_fpstatus if the op
+     * requires them.
      */
     TCGCond cond;
 
@@ -6741,6 +6744,28 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
     case 0x7f: /* FSQRT */
         gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env);
         break;
+    case 0x1a: /* FCVTNS */
+    case 0x1b: /* FCVTMS */
+    case 0x1c: /* FCVTAS */
+    case 0x3a: /* FCVTPS */
+    case 0x3b: /* FCVTZS */
+    {
+        TCGv_i32 tcg_shift = tcg_const_i32(0);
+        gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus);
+        tcg_temp_free_i32(tcg_shift);
+        break;
+    }
+    case 0x5a: /* FCVTNU */
+    case 0x5b: /* FCVTMU */
+    case 0x5c: /* FCVTAU */
+    case 0x7a: /* FCVTPU */
+    case 0x7b: /* FCVTZU */
+    {
+        TCGv_i32 tcg_shift = tcg_const_i32(0);
+        gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus);
+        tcg_temp_free_i32(tcg_shift);
+        break;
+    }
     default:
         g_assert_not_reached();
     }
@@ -6868,6 +6893,10 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
     int opcode = extract32(insn, 12, 5);
     int size = extract32(insn, 22, 2);
     bool u = extract32(insn, 29, 1);
+    bool is_fcvt = false;
+    int rmode;
+    TCGv_i32 tcg_rmode;
+    TCGv_ptr tcg_fpstatus;
 
     switch (opcode) {
     case 0xa: /* CMLT */
@@ -6909,17 +6938,24 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
         }
         case 0x1a: /* FCVTNS */
         case 0x1b: /* FCVTMS */
-        case 0x1c: /* FCVTAS */
         case 0x3a: /* FCVTPS */
         case 0x3b: /* FCVTZS */
+        case 0x5a: /* FCVTNU */
+        case 0x5b: /* FCVTMU */
+        case 0x7a: /* FCVTPU */
+        case 0x7b: /* FCVTZU */
+            is_fcvt = true;
+            rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
+            break;
+        case 0x1c: /* FCVTAS */
+        case 0x5c: /* FCVTAU */
+            /* TIEAWAY doesn't fit in the usual rounding mode encoding */
+            is_fcvt = true;
+            rmode = FPROUNDING_TIEAWAY;
+            break;
         case 0x3d: /* FRECPE */
         case 0x3f: /* FRECPX */
         case 0x56: /* FCVTXN, FCVTXN2 */
-        case 0x5a: /* FCVTNU */
-        case 0x5b: /* FCVTMU */
-        case 0x5c: /* FCVTAU */
-        case 0x7a: /* FCVTPU */
-        case 0x7b: /* FCVTZU */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);
             return;
@@ -6938,18 +6974,66 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
         return;
     }
 
+    if (is_fcvt) {
+        tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+        tcg_fpstatus = get_fpstatus_ptr();
+    } else {
+        TCGV_UNUSED_I32(tcg_rmode);
+        TCGV_UNUSED_PTR(tcg_fpstatus);
+    }
+
     if (size == 3) {
         TCGv_i64 tcg_rn = read_fp_dreg(s, rn);
         TCGv_i64 tcg_rd = tcg_temp_new_i64();
 
-        handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn);
+        handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus);
         write_fp_dreg(s, rd, tcg_rd);
         tcg_temp_free_i64(tcg_rd);
         tcg_temp_free_i64(tcg_rn);
+    } else if (size == 2) {
+        TCGv_i32 tcg_rn = read_fp_sreg(s, rn);
+        TCGv_i32 tcg_rd = tcg_temp_new_i32();
+
+        switch (opcode) {
+        case 0x1a: /* FCVTNS */
+        case 0x1b: /* FCVTMS */
+        case 0x1c: /* FCVTAS */
+        case 0x3a: /* FCVTPS */
+        case 0x3b: /* FCVTZS */
+        {
+            TCGv_i32 tcg_shift = tcg_const_i32(0);
+            gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus);
+            tcg_temp_free_i32(tcg_shift);
+            break;
+        }
+        case 0x5a: /* FCVTNU */
+        case 0x5b: /* FCVTMU */
+        case 0x5c: /* FCVTAU */
+        case 0x7a: /* FCVTPU */
+        case 0x7b: /* FCVTZU */
+        {
+            TCGv_i32 tcg_shift = tcg_const_i32(0);
+            gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus);
+            tcg_temp_free_i32(tcg_shift);
+            break;
+        }
+        default:
+            g_assert_not_reached();
+        }
+
+        write_fp_sreg(s, rd, tcg_rd);
+        tcg_temp_free_i32(tcg_rd);
+        tcg_temp_free_i32(tcg_rn);
     } else {
-        /* the 'size might not be 64' ops aren't implemented yet */
         g_assert_not_reached();
     }
+
+    if (is_fcvt) {
+        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+        tcg_temp_free_i32(tcg_rmode);
+        tcg_temp_free_ptr(tcg_fpstatus);
+    }
 }
 
 /* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
@@ -8573,6 +8657,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
     bool is_q = extract32(insn, 30, 1);
     int rn = extract32(insn, 5, 5);
     int rd = extract32(insn, 0, 5);
+    bool need_fpstatus = false;
+    bool need_rmode = false;
+    int rmode = -1;
+    TCGv_i32 tcg_rmode;
+    TCGv_ptr tcg_fpstatus;
 
     switch (opcode) {
     case 0x0: /* REV64, REV32 */
@@ -8691,28 +8780,44 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
+        case 0x1a: /* FCVTNS */
+        case 0x1b: /* FCVTMS */
+        case 0x3a: /* FCVTPS */
+        case 0x3b: /* FCVTZS */
+        case 0x5a: /* FCVTNU */
+        case 0x5b: /* FCVTMU */
+        case 0x7a: /* FCVTPU */
+        case 0x7b: /* FCVTZU */
+            need_fpstatus = true;
+            need_rmode = true;
+            rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            break;
+        case 0x5c: /* FCVTAU */
+        case 0x1c: /* FCVTAS */
+            need_fpstatus = true;
+            need_rmode = true;
+            rmode = FPROUNDING_TIEAWAY;
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            break;
         case 0x16: /* FCVTN, FCVTN2 */
         case 0x17: /* FCVTL, FCVTL2 */
         case 0x18: /* FRINTN */
         case 0x19: /* FRINTM */
-        case 0x1a: /* FCVTNS */
-        case 0x1b: /* FCVTMS */
-        case 0x1c: /* FCVTAS */
         case 0x38: /* FRINTP */
         case 0x39: /* FRINTZ */
-        case 0x3a: /* FCVTPS */
-        case 0x3b: /* FCVTZS */
         case 0x3c: /* URECPE */
         case 0x3d: /* FRECPE */
         case 0x56: /* FCVTXN, FCVTXN2 */
         case 0x58: /* FRINTA */
         case 0x59: /* FRINTX */
-        case 0x5a: /* FCVTNU */
-        case 0x5b: /* FCVTMU */
-        case 0x5c: /* FCVTAU */
         case 0x79: /* FRINTI */
-        case 0x7a: /* FCVTPU */
-        case 0x7b: /* FCVTZU */
         case 0x7c: /* URSQRTE */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);
@@ -8728,6 +8833,18 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         return;
     }
 
+    if (need_fpstatus) {
+        tcg_fpstatus = get_fpstatus_ptr();
+    } else {
+        TCGV_UNUSED_PTR(tcg_fpstatus);
+    }
+    if (need_rmode) {
+        tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    } else {
+        TCGV_UNUSED_I32(tcg_rmode);
+    }
+
     if (size == 3) {
         /* All 64-bit element operations can be shared with scalar 2misc */
         int pass;
@@ -8738,7 +8855,8 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
 
             read_vec_element(s, tcg_op, rn, pass, MO_64);
 
-            handle_2misc_64(s, opcode, u, tcg_res, tcg_op);
+            handle_2misc_64(s, opcode, u, tcg_res, tcg_op,
+                            tcg_rmode, tcg_fpstatus);
 
             write_vec_element(s, tcg_res, rd, pass, MO_64);
 
@@ -8801,6 +8919,30 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 case 0x7f: /* FSQRT */
                     gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env);
                     break;
+                case 0x1a: /* FCVTNS */
+                case 0x1b: /* FCVTMS */
+                case 0x1c: /* FCVTAS */
+                case 0x3a: /* FCVTPS */
+                case 0x3b: /* FCVTZS */
+                {
+                    TCGv_i32 tcg_shift = tcg_const_i32(0);
+                    gen_helper_vfp_tosls(tcg_res, tcg_op,
+                                         tcg_shift, tcg_fpstatus);
+                    tcg_temp_free_i32(tcg_shift);
+                    break;
+                }
+                case 0x5a: /* FCVTNU */
+                case 0x5b: /* FCVTMU */
+                case 0x5c: /* FCVTAU */
+                case 0x7a: /* FCVTPU */
+                case 0x7b: /* FCVTZU */
+                {
+                    TCGv_i32 tcg_shift = tcg_const_i32(0);
+                    gen_helper_vfp_touls(tcg_res, tcg_op,
+                                         tcg_shift, tcg_fpstatus);
+                    tcg_temp_free_i32(tcg_shift);
+                    break;
+                }
                 default:
                     g_assert_not_reached();
                 }
@@ -8893,6 +9035,14 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
     if (!is_q) {
         clear_vec_high(s, rd);
     }
+
+    if (need_rmode) {
+        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+        tcg_temp_free_i32(tcg_rmode);
+    }
+    if (need_fpstatus) {
+        tcg_temp_free_ptr(tcg_fpstatus);
+    }
 }
 
 /* C3.6.13 AdvSIMD scalar x indexed element

From 261a5b4dd1dc6c68b274cc39bb5d4d236b24d4cd Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:49 +0000
Subject: [PATCH 15/30] target-arm: A64: Implement FCVTN

Implement FCVTN (narrowing fp-to-fp conversions) from the SIMD
2-reg-misc category.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-11-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index ec77c8b301..f6a4ce752f 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -8450,13 +8450,30 @@ static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
             genenvfn = fns[size][u];
             break;
         }
+        case 0x16: /* FCVTN, FCVTN2 */
+            /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */
+            if (size == 2) {
+                gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env);
+            } else {
+                TCGv_i32 tcg_lo = tcg_temp_new_i32();
+                TCGv_i32 tcg_hi = tcg_temp_new_i32();
+                tcg_gen_trunc_i64_i32(tcg_lo, tcg_op);
+                gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, cpu_env);
+                tcg_gen_shri_i64(tcg_op, tcg_op, 32);
+                tcg_gen_trunc_i64_i32(tcg_hi, tcg_op);
+                gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, cpu_env);
+                tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
+                tcg_temp_free_i32(tcg_lo);
+                tcg_temp_free_i32(tcg_hi);
+            }
+            break;
         default:
             g_assert_not_reached();
         }
 
         if (genfn) {
             genfn(tcg_res[pass], tcg_op);
-        } else {
+        } else if (genenvfn) {
             genenvfn(tcg_res[pass], cpu_env, tcg_op);
         }
 
@@ -8807,6 +8824,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             }
             break;
         case 0x16: /* FCVTN, FCVTN2 */
+            /* handle_2misc_narrow does a 2*size -> size operation, but these
+             * instructions encode the source size rather than dest size.
+             */
+            handle_2misc_narrow(s, opcode, 0, is_q, size - 1, rn, rd);
+            return;
         case 0x17: /* FCVTL, FCVTL2 */
         case 0x18: /* FRINTN */
         case 0x19: /* FRINTM */

From 931c8cc270793877f8d7bf9934ac9fa3fb7800be Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:49 +0000
Subject: [PATCH 16/30] target-arm: A64: Implement FCVTL

Implement FCVTL, the only instruction in the 2-reg-misc group
which widens from size to 2*size elements.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-12-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 47 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f6a4ce752f..e6addf45f1 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -8489,6 +8489,51 @@ static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
     }
 }
 
+static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q,
+                                  int size, int rn, int rd)
+{
+    /* Handle 2-reg-misc ops which are widening (so each size element
+     * in the source becomes a 2*size element in the destination.
+     * The only instruction like this is FCVTL.
+     */
+    int pass;
+
+    if (size == 3) {
+        /* 32 -> 64 bit fp conversion */
+        TCGv_i64 tcg_res[2];
+        int srcelt = is_q ? 2 : 0;
+
+        for (pass = 0; pass < 2; pass++) {
+            TCGv_i32 tcg_op = tcg_temp_new_i32();
+            tcg_res[pass] = tcg_temp_new_i64();
+
+            read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32);
+            gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env);
+            tcg_temp_free_i32(tcg_op);
+        }
+        for (pass = 0; pass < 2; pass++) {
+            write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+            tcg_temp_free_i64(tcg_res[pass]);
+        }
+    } else {
+        /* 16 -> 32 bit fp conversion */
+        int srcelt = is_q ? 4 : 0;
+        TCGv_i32 tcg_res[4];
+
+        for (pass = 0; pass < 4; pass++) {
+            tcg_res[pass] = tcg_temp_new_i32();
+
+            read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16);
+            gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass],
+                                           cpu_env);
+        }
+        for (pass = 0; pass < 4; pass++) {
+            write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32);
+            tcg_temp_free_i32(tcg_res[pass]);
+        }
+    }
+}
+
 static void handle_rev(DisasContext *s, int opcode, bool u,
                        bool is_q, int size, int rn, int rd)
 {
@@ -8830,6 +8875,8 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             handle_2misc_narrow(s, opcode, 0, is_q, size - 1, rn, rd);
             return;
         case 0x17: /* FCVTL, FCVTL2 */
+            handle_2misc_widening(s, opcode, is_q, size, rn, rd);
+            return;
         case 0x18: /* FRINTN */
         case 0x19: /* FRINTM */
         case 0x38: /* FRINTP */

From a566da1b02704a79038043ddbe850f40b033cd63 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:50 +0000
Subject: [PATCH 17/30] target-arm: A64: List unsupported shift-imm opcodes

Add the remaining unsupported opcodes to the decode switches
for the shift-imm and scalar shift-imm categories so we can
see what is still to be implemented.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-13-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index e6addf45f1..2b1ca64b19 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6135,9 +6135,15 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
         handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
                                immh, immb, opcode, rn, rd);
         break;
-    default:
+    case 0x8: /* SRI */
+    case 0xc: /* SQSHLU */
+    case 0xe: /* SQSHL, UQSHL */
+    case 0x1f: /* FCVTZS, FCVTZU */
         unsupported_encoding(s, insn);
         break;
+    default:
+        unallocated_encoding(s);
+        break;
     }
 }
 
@@ -7281,11 +7287,14 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
         handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
                                      opcode, rn, rd);
         break;
+    case 0x8: /* SRI */
+    case 0xc: /* SQSHLU */
+    case 0xe: /* SQSHL, UQSHL */
     case 0x1f: /* FCVTZS/ FCVTZU */
         unsupported_encoding(s, insn);
         return;
     default:
-        unsupported_encoding(s, insn);
+        unallocated_encoding(s);
         return;
     }
 }

From 8f0c6758b0e1c3b9676e7c3ccea8a176537cf843 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:50 +0000
Subject: [PATCH 18/30] target-arm: A64: Add FRECPX (reciprocal exponent)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are fairly simple exponent only estimation functions using helpers.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-14-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper-a64.c    | 59 ++++++++++++++++++++++++++++++++
 target-arm/helper-a64.h    |  2 ++
 target-arm/translate-a64.c | 70 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index c31c45e729..cea2468b30 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -354,3 +354,62 @@ uint64_t HELPER(neon_addlp_u16)(uint64_t a)
     tmp += (a >> 16) & 0x0000ffff0000ffffULL;
     return tmp;
 }
+
+/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
+float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    uint32_t val32, sbit;
+    int32_t exp;
+
+    if (float32_is_any_nan(a)) {
+        float32 nan = a;
+        if (float32_is_signaling_nan(a)) {
+            float_raise(float_flag_invalid, fpst);
+            nan = float32_maybe_silence_nan(a);
+        }
+        if (fpst->default_nan_mode) {
+            nan = float32_default_nan;
+        }
+        return nan;
+    }
+
+    val32 = float32_val(a);
+    sbit = 0x80000000ULL & val32;
+    exp = extract32(val32, 23, 8);
+
+    if (exp == 0) {
+        return make_float32(sbit | (0xfe << 23));
+    } else {
+        return make_float32(sbit | (~exp & 0xff) << 23);
+    }
+}
+
+float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    uint64_t val64, sbit;
+    int64_t exp;
+
+    if (float64_is_any_nan(a)) {
+        float64 nan = a;
+        if (float64_is_signaling_nan(a)) {
+            float_raise(float_flag_invalid, fpst);
+            nan = float64_maybe_silence_nan(a);
+        }
+        if (fpst->default_nan_mode) {
+            nan = float64_default_nan;
+        }
+        return nan;
+    }
+
+    val64 = float64_val(a);
+    sbit = 0x8000000000000000ULL & val64;
+    exp = extract64(float64_val(a), 52, 11);
+
+    if (exp == 0) {
+        return make_float64(sbit | (0x7feULL << 52));
+    } else {
+        return make_float64(sbit | (~exp & 0x7ffULL) << 52);
+    }
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 88fc9fe9e7..8cbc3492d4 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -43,3 +43,5 @@ DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
+DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 2b1ca64b19..86e5d3e1db 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6886,6 +6886,72 @@ static void handle_2misc_fcmp_zero(DisasContext *s, int opcode,
     tcg_temp_free_ptr(fpst);
 }
 
+static void handle_2misc_reciprocal(DisasContext *s, int opcode,
+                                    bool is_scalar, bool is_u, bool is_q,
+                                    int size, int rn, int rd)
+{
+    bool is_double = (size == 3);
+    TCGv_ptr fpst = get_fpstatus_ptr();
+
+    if (is_double) {
+        TCGv_i64 tcg_op = tcg_temp_new_i64();
+        TCGv_i64 tcg_res = tcg_temp_new_i64();
+        int pass;
+
+        for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+            switch (opcode) {
+            case 0x3f: /* FRECPX */
+                gen_helper_frecpx_f64(tcg_res, tcg_op, fpst);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+            write_vec_element(s, tcg_res, rd, pass, MO_64);
+        }
+        if (is_scalar) {
+            clear_vec_high(s, rd);
+        }
+
+        tcg_temp_free_i64(tcg_res);
+        tcg_temp_free_i64(tcg_op);
+    } else {
+        TCGv_i32 tcg_op = tcg_temp_new_i32();
+        TCGv_i32 tcg_res = tcg_temp_new_i32();
+        int pass, maxpasses;
+
+        if (is_scalar) {
+            maxpasses = 1;
+        } else {
+            maxpasses = is_q ? 4 : 2;
+        }
+
+        for (pass = 0; pass < maxpasses; pass++) {
+            read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
+
+            switch (opcode) {
+            case 0x3f: /* FRECPX */
+                gen_helper_frecpx_f32(tcg_res, tcg_op, fpst);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (is_scalar) {
+                write_fp_sreg(s, rd, tcg_res);
+            } else {
+                write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+            }
+        }
+        tcg_temp_free_i32(tcg_res);
+        tcg_temp_free_i32(tcg_op);
+        if (!is_q && !is_scalar) {
+            clear_vec_high(s, rd);
+        }
+    }
+    tcg_temp_free_ptr(fpst);
+}
+
 /* C3.6.12 AdvSIMD scalar two reg misc
  *  31 30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
  * +-----+---+-----------+------+-----------+--------+-----+------+------+
@@ -6942,6 +7008,9 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size);
             return;
         }
+        case 0x3f: /* FRECPX */
+            handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd);
+            return;
         case 0x1a: /* FCVTNS */
         case 0x1b: /* FCVTMS */
         case 0x3a: /* FCVTPS */
@@ -6960,7 +7029,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             rmode = FPROUNDING_TIEAWAY;
             break;
         case 0x3d: /* FRECPE */
-        case 0x3f: /* FRECPX */
         case 0x56: /* FCVTXN, FCVTXN2 */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);

From 37a706adbf96fbb05abbb8b17b14aebee266f2d1 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:50 +0000
Subject: [PATCH 19/30] target-arm: A64: Implement SRI

Implement SRI (shift right and insert).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-15-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 57 ++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 86e5d3e1db..b67cd1df51 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5828,6 +5828,21 @@ static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
     }
 }
 
+/* SRI: shift right with insert */
+static void handle_shri_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+                                 int size, int shift)
+{
+    int esize = 8 << size;
+
+    /* shift count same as element size is valid but does nothing;
+     * special case to avoid potential shift by 64.
+     */
+    if (shift != esize) {
+        tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+        tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, 0, esize - shift);
+    }
+}
+
 /* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
 static void handle_scalar_simd_shri(DisasContext *s,
                                     bool is_u, int immh, int immb,
@@ -5838,6 +5853,7 @@ static void handle_scalar_simd_shri(DisasContext *s,
     int shift = 2 * (8 << size) - immhb;
     bool accumulate = false;
     bool round = false;
+    bool insert = false;
     TCGv_i64 tcg_rn;
     TCGv_i64 tcg_rd;
     TCGv_i64 tcg_round;
@@ -5857,6 +5873,9 @@ static void handle_scalar_simd_shri(DisasContext *s,
     case 0x06: /* SRSRA / URSRA (accum + rounding) */
         accumulate = round = true;
         break;
+    case 0x08: /* SRI */
+        insert = true;
+        break;
     }
 
     if (round) {
@@ -5867,10 +5886,14 @@ static void handle_scalar_simd_shri(DisasContext *s,
     }
 
     tcg_rn = read_fp_dreg(s, rn);
-    tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+    tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
 
-    handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
-                               accumulate, is_u, size, shift);
+    if (insert) {
+        handle_shri_with_ins(tcg_rd, tcg_rn, size, shift);
+    } else {
+        handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+                                accumulate, is_u, size, shift);
+    }
 
     write_fp_dreg(s, rd, tcg_rd);
 
@@ -6108,6 +6131,12 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
     }
 
     switch (opcode) {
+    case 0x08: /* SRI */
+        if (!is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
     case 0x00: /* SSHR / USHR */
     case 0x02: /* SSRA / USRA */
     case 0x04: /* SRSHR / URSHR */
@@ -6135,7 +6164,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
         handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
                                immh, immb, opcode, rn, rd);
         break;
-    case 0x8: /* SRI */
     case 0xc: /* SQSHLU */
     case 0xe: /* SQSHL, UQSHL */
     case 0x1f: /* FCVTZS, FCVTZU */
@@ -7119,6 +7147,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
     int shift = 2 * (8 << size) - immhb;
     bool accumulate = false;
     bool round = false;
+    bool insert = false;
     int dsize = is_q ? 128 : 64;
     int esize = 8 << size;
     int elements = dsize/esize;
@@ -7148,6 +7177,9 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
     case 0x06: /* SRSRA / URSRA (accum + rounding) */
         accumulate = round = true;
         break;
+    case 0x08: /* SRI */
+        insert = true;
+        break;
     }
 
     if (round) {
@@ -7159,12 +7191,16 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
 
     for (i = 0; i < elements; i++) {
         read_vec_element(s, tcg_rn, rn, i, memop);
-        if (accumulate) {
+        if (accumulate || insert) {
             read_vec_element(s, tcg_rd, rd, i, memop);
         }
 
-        handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
-                                accumulate, is_u, size, shift);
+        if (insert) {
+            handle_shri_with_ins(tcg_rd, tcg_rn, size, shift);
+        } else {
+            handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+                                    accumulate, is_u, size, shift);
+        }
 
         write_vec_element(s, tcg_rd, rd, i, size);
     }
@@ -7325,6 +7361,12 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
     bool is_q = extract32(insn, 30, 1);
 
     switch (opcode) {
+    case 0x08: /* SRI */
+        if (!is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
     case 0x00: /* SSHR / USHR */
     case 0x02: /* SSRA / USRA (accumulate) */
     case 0x04: /* SRSHR / URSHR (rounding) */
@@ -7355,7 +7397,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
         handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
                                      opcode, rn, rd);
         break;
-    case 0x8: /* SRI */
     case 0xc: /* SQSHLU */
     case 0xe: /* SQSHL, UQSHL */
     case 0x1f: /* FCVTZS/ FCVTZU */

From 03df01ed9a83a22790f3fd1cfbe7fc48caf9bfd0 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:50 +0000
Subject: [PATCH 20/30] target-arm: A64: Implement FRINT*

Implement the FRINT* round-to-integral operations from
the 2-reg-misc category.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-16-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 45 +++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index b67cd1df51..32eed418b0 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6800,6 +6800,17 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
         tcg_temp_free_i32(tcg_shift);
         break;
     }
+    case 0x18: /* FRINTN */
+    case 0x19: /* FRINTM */
+    case 0x38: /* FRINTP */
+    case 0x39: /* FRINTZ */
+    case 0x58: /* FRINTA */
+    case 0x79: /* FRINTI */
+        gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus);
+        break;
+    case 0x59: /* FRINTX */
+        gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -8999,12 +9010,29 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x19: /* FRINTM */
         case 0x38: /* FRINTP */
         case 0x39: /* FRINTZ */
+            need_rmode = true;
+            rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1);
+            /* fall through */
+        case 0x59: /* FRINTX */
+        case 0x79: /* FRINTI */
+            need_fpstatus = true;
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            break;
+        case 0x58: /* FRINTA */
+            need_rmode = true;
+            rmode = FPROUNDING_TIEAWAY;
+            need_fpstatus = true;
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            break;
         case 0x3c: /* URECPE */
         case 0x3d: /* FRECPE */
         case 0x56: /* FCVTXN, FCVTXN2 */
-        case 0x58: /* FRINTA */
-        case 0x59: /* FRINTX */
-        case 0x79: /* FRINTI */
         case 0x7c: /* URSQRTE */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);
@@ -9130,6 +9158,17 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                     tcg_temp_free_i32(tcg_shift);
                     break;
                 }
+                case 0x18: /* FRINTN */
+                case 0x19: /* FRINTM */
+                case 0x38: /* FRINTP */
+                case 0x39: /* FRINTZ */
+                case 0x58: /* FRINTA */
+                case 0x79: /* FRINTI */
+                    gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus);
+                    break;
+                case 0x59: /* FRINTX */
+                    gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus);
+                    break;
                 default:
                     g_assert_not_reached();
                 }

From 14dcdac82f398cbac874c8579b9583fab31c67bf Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:51 +0000
Subject: [PATCH 21/30] exec-all.h: Increase MAX_OP_PER_INSTR for ARM A64
 decoder

The ARM A64 decoder's worst case number of TCG ops per instruction
is 266 (for insn 0x4c800000, a post-indexed ST4 multiple-structures
store). Raise the MAX_OP_PER_INSTR define accordingly.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-17-git-send-email-peter.maydell@linaro.org
---
 include/exec/exec-all.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 502b7aa084..f9ac332f9d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -44,7 +44,7 @@ struct TranslationBlock;
 typedef struct TranslationBlock TranslationBlock;
 
 /* XXX: make safe guess about sizes */
-#define MAX_OP_PER_INSTR 208
+#define MAX_OP_PER_INSTR 266
 
 #if HOST_LONG_BITS == 32
 #define MAX_OPC_PARAM_PER_ARG 2

From a847f32c04538e92675c7b27f5f60d2eaad3e56c Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:51 +0000
Subject: [PATCH 22/30] target-arm: A64: Handle saturating left shifts SQSHL,
 SQSHLU, UQSHL

Implement the saturating left shift instructions SQSHL, SQSHLU
and UQSHL for the scalar-shift-imm and shift-imm categories.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-18-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 132 +++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 32eed418b0..61b234a2a4 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -76,6 +76,7 @@ typedef struct AArch64DecodeTable {
 typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
 typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
 typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
+typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
 typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
 typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
 typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
@@ -6019,6 +6020,121 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
     return;
 }
 
+/* SQSHLU, UQSHL, SQSHL: saturating left shifts */
+static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q,
+                             bool src_unsigned, bool dst_unsigned,
+                             int immh, int immb, int rn, int rd)
+{
+    int immhb = immh << 3 | immb;
+    int size = 32 - clz32(immh) - 1;
+    int shift = immhb - (8 << size);
+    int pass;
+
+    assert(immh != 0);
+    assert(!(scalar && is_q));
+
+    if (!scalar) {
+        if (!is_q && extract32(immh, 3, 1)) {
+            unallocated_encoding(s);
+            return;
+        }
+
+        /* Since we use the variable-shift helpers we must
+         * replicate the shift count into each element of
+         * the tcg_shift value.
+         */
+        switch (size) {
+        case 0:
+            shift |= shift << 8;
+            /* fall through */
+        case 1:
+            shift |= shift << 16;
+            break;
+        case 2:
+        case 3:
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
+
+    if (size == 3) {
+        TCGv_i64 tcg_shift = tcg_const_i64(shift);
+        static NeonGenTwo64OpEnvFn * const fns[2][2] = {
+            { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 },
+            { NULL, gen_helper_neon_qshl_u64 },
+        };
+        NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned];
+        int maxpass = is_q ? 2 : 1;
+
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i64 tcg_op = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+            genfn(tcg_op, cpu_env, tcg_op, tcg_shift);
+            write_vec_element(s, tcg_op, rd, pass, MO_64);
+
+            tcg_temp_free_i64(tcg_op);
+        }
+        tcg_temp_free_i64(tcg_shift);
+
+        if (!is_q) {
+            clear_vec_high(s, rd);
+        }
+    } else {
+        TCGv_i32 tcg_shift = tcg_const_i32(shift);
+        static NeonGenTwoOpEnvFn * const fns[2][2][3] = {
+            {
+                { gen_helper_neon_qshl_s8,
+                  gen_helper_neon_qshl_s16,
+                  gen_helper_neon_qshl_s32 },
+                { gen_helper_neon_qshlu_s8,
+                  gen_helper_neon_qshlu_s16,
+                  gen_helper_neon_qshlu_s32 }
+            }, {
+                { NULL, NULL, NULL },
+                { gen_helper_neon_qshl_u8,
+                  gen_helper_neon_qshl_u16,
+                  gen_helper_neon_qshl_u32 }
+            }
+        };
+        NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size];
+        TCGMemOp memop = scalar ? size : MO_32;
+        int maxpass = scalar ? 1 : is_q ? 4 : 2;
+
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+            read_vec_element_i32(s, tcg_op, rn, pass, memop);
+            genfn(tcg_op, cpu_env, tcg_op, tcg_shift);
+            if (scalar) {
+                switch (size) {
+                case 0:
+                    tcg_gen_ext8u_i32(tcg_op, tcg_op);
+                    break;
+                case 1:
+                    tcg_gen_ext16u_i32(tcg_op, tcg_op);
+                    break;
+                case 2:
+                    break;
+                default:
+                    g_assert_not_reached();
+                }
+                write_fp_sreg(s, rd, tcg_op);
+            } else {
+                write_vec_element_i32(s, tcg_op, rd, pass, MO_32);
+            }
+
+            tcg_temp_free_i32(tcg_op);
+        }
+        tcg_temp_free_i32(tcg_shift);
+
+        if (!is_q && !scalar) {
+            clear_vec_high(s, rd);
+        }
+    }
+}
+
 /* Common vector code for handling integer to FP conversion */
 static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
                                    int elements, int is_signed,
@@ -6165,7 +6281,15 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
                                immh, immb, opcode, rn, rd);
         break;
     case 0xc: /* SQSHLU */
+        if (!is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd);
+        break;
     case 0xe: /* SQSHL, UQSHL */
+        handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd);
+        break;
     case 0x1f: /* FCVTZS, FCVTZU */
         unsupported_encoding(s, insn);
         break;
@@ -7409,7 +7533,15 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
                                      opcode, rn, rd);
         break;
     case 0xc: /* SQSHLU */
+        if (!is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd);
+        break;
     case 0xe: /* SQSHL, UQSHL */
+        handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd);
+        break;
     case 0x1f: /* FCVTZS/ FCVTZU */
         unsupported_encoding(s, insn);
         return;

From 2ed3ea110f47a7e3639281edb1d6483b1efce6c3 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:51 +0000
Subject: [PATCH 23/30] target-arm: A64: Implement FCVTZS, FCVTZU in the
 shift-imm categories

Implement FCVTZS and FCVTZU in the shift-imm and scalar-shift-imm
categories; this completes the implementation of those two groups.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-19-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 80 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 61b234a2a4..0330ce9070 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6224,6 +6224,82 @@ static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar,
     handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size);
 }
 
+/* FCVTZS, FVCVTZU - FP to fixedpoint conversion */
+static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar,
+                                         bool is_q, bool is_u,
+                                         int immh, int immb, int rn, int rd)
+{
+    bool is_double = extract32(immh, 3, 1);
+    int immhb = immh << 3 | immb;
+    int fracbits = (is_double ? 128 : 64) - immhb;
+    int pass;
+    TCGv_ptr tcg_fpstatus;
+    TCGv_i32 tcg_rmode, tcg_shift;
+
+    if (!extract32(immh, 2, 2)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!is_scalar && !is_q && is_double) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    assert(!(is_scalar && is_q));
+
+    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO));
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    tcg_fpstatus = get_fpstatus_ptr();
+    tcg_shift = tcg_const_i32(fracbits);
+
+    if (is_double) {
+        int maxpass = is_scalar ? 1 : is_q ? 2 : 1;
+
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i64 tcg_op = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+            if (is_u) {
+                gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+            } else {
+                gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+            }
+            write_vec_element(s, tcg_op, rd, pass, MO_64);
+            tcg_temp_free_i64(tcg_op);
+        }
+        if (!is_q) {
+            clear_vec_high(s, rd);
+        }
+    } else {
+        int maxpass = is_scalar ? 1 : is_q ? 4 : 2;
+        for (pass = 0; pass < maxpass; pass++) {
+            TCGv_i32 tcg_op = tcg_temp_new_i32();
+
+            read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
+            if (is_u) {
+                gen_helper_vfp_touls(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+            } else {
+                gen_helper_vfp_tosls(tcg_op, tcg_op, tcg_shift, tcg_fpstatus);
+            }
+            if (is_scalar) {
+                write_fp_sreg(s, rd, tcg_op);
+            } else {
+                write_vec_element_i32(s, tcg_op, rd, pass, MO_32);
+            }
+            tcg_temp_free_i32(tcg_op);
+        }
+        if (!is_q && !is_scalar) {
+            clear_vec_high(s, rd);
+        }
+    }
+
+    tcg_temp_free_ptr(tcg_fpstatus);
+    tcg_temp_free_i32(tcg_shift);
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    tcg_temp_free_i32(tcg_rmode);
+}
+
 /* C3.6.9 AdvSIMD scalar shift by immediate
  *  31 30  29 28         23 22  19 18  16 15    11  10 9    5 4    0
  * +-----+---+-------------+------+------+--------+---+------+------+
@@ -6291,7 +6367,7 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
         handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd);
         break;
     case 0x1f: /* FCVTZS, FCVTZU */
-        unsupported_encoding(s, insn);
+        handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd);
         break;
     default:
         unallocated_encoding(s);
@@ -7543,7 +7619,7 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
         handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd);
         break;
     case 0x1f: /* FCVTZS/ FCVTZU */
-        unsupported_encoding(s, insn);
+        handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd);
         return;
     default:
         unallocated_encoding(s);

From 7baeabce1d25c667d0ec7e4e74a1312e0b887b54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:51 +0000
Subject: [PATCH 24/30] softfloat: export squash_input_denormal functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I need these available outside of softfloat for some of the reciprocal
processing in aarch64 helper functions.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-20-git-send-email-peter.maydell@linaro.org
---
 fpu/softfloat.c         | 4 ++--
 include/fpu/softfloat.h | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index fc0b179df4..5f02c16d8d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -288,7 +288,7 @@ INLINE flag extractFloat32Sign( float32 a )
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
 *----------------------------------------------------------------------------*/
-static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
+float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 {
     if (STATUS(flush_inputs_to_zero)) {
         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
@@ -473,7 +473,7 @@ INLINE flag extractFloat64Sign( float64 a )
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
 *----------------------------------------------------------------------------*/
-static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
+float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 {
     if (STATUS(flush_inputs_to_zero)) {
         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 4b4df88527..db878c1313 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -244,6 +244,13 @@ INLINE flag get_default_nan_mode(float_status *status)
 *----------------------------------------------------------------------------*/
 void float_raise( int8 flags STATUS_PARAM);
 
+/*----------------------------------------------------------------------------
+| If `a' is denormal and we are in flush-to-zero mode then set the
+| input-denormal exception and return zero. Otherwise just return the value.
+*----------------------------------------------------------------------------*/
+float32 float32_squash_input_denormal(float32 a STATUS_PARAM);
+float64 float64_squash_input_denormal(float64 a STATUS_PARAM);
+
 /*----------------------------------------------------------------------------
 | Options to indicate which negations to perform in float*_muladd()
 | Using these differs from negating an input or output before calling

From b6d4443a7bf607c5ca5d4b8dabffc421e571f4eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:52 +0000
Subject: [PATCH 25/30] target-arm: A64: Implement AdvSIMD reciprocal estimate
 insns URECPE, FRECPE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement URECPE and FRECPE instructions in both scalar and vector forms.
The actual reciprocal estimate function is shared with the A32/T32 Neon
code. However in A64 we aren't using the Neon "standard FPSCR value"
so extra checks are necessary to handle non-squashed denormal inputs
which can never happen for A32/T32. Calling conventions for the helpers
are thus modified to pass the fpst directly; we mark the helpers as
TCG_CALL_NO_RWG since we're changing the declarations anyway.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-21-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper.c        | 202 ++++++++++++++++++++++++++++++-------
 target-arm/helper.h        |   5 +-
 target-arm/translate-a64.c |  22 +++-
 target-arm/translate.c     |  12 ++-
 4 files changed, 197 insertions(+), 44 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 5080372bad..535fc8f35e 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4520,16 +4520,21 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUARMState *env)
  * int->float conversions at run-time.  */
 #define float64_256 make_float64(0x4070000000000000LL)
 #define float64_512 make_float64(0x4080000000000000LL)
+#define float32_maxnorm make_float32(0x7f7fffff)
+#define float64_maxnorm make_float64(0x7fefffffffffffffLL)
 
-/* The algorithm that must be used to calculate the estimate
- * is specified by the ARM ARM.
+/* Reciprocal functions
+ *
+ * The algorithm that must be used to calculate the estimate
+ * is specified by the ARM ARM, see FPRecipEstimate()
  */
-static float64 recip_estimate(float64 a, CPUARMState *env)
+
+static float64 recip_estimate(float64 a, float_status *real_fp_status)
 {
     /* These calculations mustn't set any fp exception flags,
      * so we use a local copy of the fp_status.
      */
-    float_status dummy_status = env->vfp.standard_fp_status;
+    float_status dummy_status = *real_fp_status;
     float_status *s = &dummy_status;
     /* q = (int)(a * 512.0) */
     float64 q = float64_mul(float64_512, a, s);
@@ -4550,45 +4555,167 @@ static float64 recip_estimate(float64 a, CPUARMState *env)
     return float64_div(int64_to_float64(q_int, s), float64_256, s);
 }
 
-float32 HELPER(recpe_f32)(float32 a, CPUARMState *env)
+/* Common wrapper to call recip_estimate */
+static float64 call_recip_estimate(float64 num, int off, float_status *fpst)
 {
-    float_status *s = &env->vfp.standard_fp_status;
-    float64 f64;
-    uint32_t val32 = float32_val(a);
+    uint64_t val64 = float64_val(num);
+    uint64_t frac = extract64(val64, 0, 52);
+    int64_t exp = extract64(val64, 52, 11);
+    uint64_t sbit;
+    float64 scaled, estimate;
 
-    int result_exp;
-    int a_exp = (val32  & 0x7f800000) >> 23;
-    int sign = val32 & 0x80000000;
-
-    if (float32_is_any_nan(a)) {
-        if (float32_is_signaling_nan(a)) {
-            float_raise(float_flag_invalid, s);
+    /* Generate the scaled number for the estimate function */
+    if (exp == 0) {
+        if (extract64(frac, 51, 1) == 0) {
+            exp = -1;
+            frac = extract64(frac, 0, 50) << 2;
+        } else {
+            frac = extract64(frac, 0, 51) << 1;
         }
-        return float32_default_nan;
-    } else if (float32_is_infinity(a)) {
-        return float32_set_sign(float32_zero, float32_is_neg(a));
-    } else if (float32_is_zero_or_denormal(a)) {
-        if (!float32_is_zero(a)) {
-            float_raise(float_flag_input_denormal, s);
-        }
-        float_raise(float_flag_divbyzero, s);
-        return float32_set_sign(float32_infinity, float32_is_neg(a));
-    } else if (a_exp >= 253) {
-        float_raise(float_flag_underflow, s);
-        return float32_set_sign(float32_zero, float32_is_neg(a));
     }
 
-    f64 = make_float64((0x3feULL << 52)
-                       | ((int64_t)(val32 & 0x7fffff) << 29));
+    /* scaled = '0' : '01111111110' : fraction<51:44> : Zeros(44); */
+    scaled = make_float64((0x3feULL << 52)
+                          | extract64(frac, 44, 8) << 44);
 
-    result_exp = 253 - a_exp;
+    estimate = recip_estimate(scaled, fpst);
 
-    f64 = recip_estimate(f64, env);
+    /* Build new result */
+    val64 = float64_val(estimate);
+    sbit = 0x8000000000000000ULL & val64;
+    exp = off - exp;
+    frac = extract64(val64, 0, 52);
 
-    val32 = sign
-        | ((result_exp & 0xff) << 23)
-        | ((float64_val(f64) >> 29) & 0x7fffff);
-    return make_float32(val32);
+    if (exp == 0) {
+        frac = 1ULL << 51 | extract64(frac, 1, 51);
+    } else if (exp == -1) {
+        frac = 1ULL << 50 | extract64(frac, 2, 50);
+        exp = 0;
+    }
+
+    return make_float64(sbit | (exp << 52) | frac);
+}
+
+static bool round_to_inf(float_status *fpst, bool sign_bit)
+{
+    switch (fpst->float_rounding_mode) {
+    case float_round_nearest_even: /* Round to Nearest */
+        return true;
+    case float_round_up: /* Round to +Inf */
+        return !sign_bit;
+    case float_round_down: /* Round to -Inf */
+        return sign_bit;
+    case float_round_to_zero: /* Round to Zero */
+        return false;
+    }
+
+    g_assert_not_reached();
+}
+
+float32 HELPER(recpe_f32)(float32 input, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    float32 f32 = float32_squash_input_denormal(input, fpst);
+    uint32_t f32_val = float32_val(f32);
+    uint32_t f32_sbit = 0x80000000ULL & f32_val;
+    int32_t f32_exp = extract32(f32_val, 23, 8);
+    uint32_t f32_frac = extract32(f32_val, 0, 23);
+    float64 f64, r64;
+    uint64_t r64_val;
+    int64_t r64_exp;
+    uint64_t r64_frac;
+
+    if (float32_is_any_nan(f32)) {
+        float32 nan = f32;
+        if (float32_is_signaling_nan(f32)) {
+            float_raise(float_flag_invalid, fpst);
+            nan = float32_maybe_silence_nan(f32);
+        }
+        if (fpst->default_nan_mode) {
+            nan =  float32_default_nan;
+        }
+        return nan;
+    } else if (float32_is_infinity(f32)) {
+        return float32_set_sign(float32_zero, float32_is_neg(f32));
+    } else if (float32_is_zero(f32)) {
+        float_raise(float_flag_divbyzero, fpst);
+        return float32_set_sign(float32_infinity, float32_is_neg(f32));
+    } else if ((f32_val & ~(1ULL << 31)) < (1ULL << 21)) {
+        /* Abs(value) < 2.0^-128 */
+        float_raise(float_flag_overflow | float_flag_inexact, fpst);
+        if (round_to_inf(fpst, f32_sbit)) {
+            return float32_set_sign(float32_infinity, float32_is_neg(f32));
+        } else {
+            return float32_set_sign(float32_maxnorm, float32_is_neg(f32));
+        }
+    } else if (f32_exp >= 253 && fpst->flush_to_zero) {
+        float_raise(float_flag_underflow, fpst);
+        return float32_set_sign(float32_zero, float32_is_neg(f32));
+    }
+
+
+    f64 = make_float64(((int64_t)(f32_exp) << 52) | (int64_t)(f32_frac) << 29);
+    r64 = call_recip_estimate(f64, 253, fpst);
+    r64_val = float64_val(r64);
+    r64_exp = extract64(r64_val, 52, 11);
+    r64_frac = extract64(r64_val, 0, 52);
+
+    /* result = sign : result_exp<7:0> : fraction<51:29>; */
+    return make_float32(f32_sbit |
+                        (r64_exp & 0xff) << 23 |
+                        extract64(r64_frac, 29, 24));
+}
+
+float64 HELPER(recpe_f64)(float64 input, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    float64 f64 = float64_squash_input_denormal(input, fpst);
+    uint64_t f64_val = float64_val(f64);
+    uint64_t f64_sbit = 0x8000000000000000ULL & f64_val;
+    int64_t f64_exp = extract64(f64_val, 52, 11);
+    float64 r64;
+    uint64_t r64_val;
+    int64_t r64_exp;
+    uint64_t r64_frac;
+
+    /* Deal with any special cases */
+    if (float64_is_any_nan(f64)) {
+        float64 nan = f64;
+        if (float64_is_signaling_nan(f64)) {
+            float_raise(float_flag_invalid, fpst);
+            nan = float64_maybe_silence_nan(f64);
+        }
+        if (fpst->default_nan_mode) {
+            nan =  float64_default_nan;
+        }
+        return nan;
+    } else if (float64_is_infinity(f64)) {
+        return float64_set_sign(float64_zero, float64_is_neg(f64));
+    } else if (float64_is_zero(f64)) {
+        float_raise(float_flag_divbyzero, fpst);
+        return float64_set_sign(float64_infinity, float64_is_neg(f64));
+    } else if ((f64_val & ~(1ULL << 63)) < (1ULL << 50)) {
+        /* Abs(value) < 2.0^-1024 */
+        float_raise(float_flag_overflow | float_flag_inexact, fpst);
+        if (round_to_inf(fpst, f64_sbit)) {
+            return float64_set_sign(float64_infinity, float64_is_neg(f64));
+        } else {
+            return float64_set_sign(float64_maxnorm, float64_is_neg(f64));
+        }
+    } else if (f64_exp >= 1023 && fpst->flush_to_zero) {
+        float_raise(float_flag_underflow, fpst);
+        return float64_set_sign(float64_zero, float64_is_neg(f64));
+    }
+
+    r64 = call_recip_estimate(f64, 2045, fpst);
+    r64_val = float64_val(r64);
+    r64_exp = extract64(r64_val, 52, 11);
+    r64_frac = extract64(r64_val, 0, 52);
+
+    /* result = sign : result_exp<10:0> : fraction<51:0> */
+    return make_float64(f64_sbit |
+                        ((r64_exp & 0x7ff) << 52) |
+                        r64_frac);
 }
 
 /* The algorithm that must be used to calculate the estimate
@@ -4697,8 +4824,9 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env)
     return make_float32(val);
 }
 
-uint32_t HELPER(recpe_u32)(uint32_t a, CPUARMState *env)
+uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
 {
+    float_status *s = fpstp;
     float64 f64;
 
     if ((a & 0x80000000) == 0) {
@@ -4708,7 +4836,7 @@ uint32_t HELPER(recpe_u32)(uint32_t a, CPUARMState *env)
     f64 = make_float64((0x3feULL << 52)
                        | ((int64_t)(a & 0x7fffffff) << 21));
 
-    f64 = recip_estimate (f64, env);
+    f64 = recip_estimate(f64, s);
 
     return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 8923f8ae71..f96a82415a 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -167,9 +167,10 @@ DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 
 DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
-DEF_HELPER_2(recpe_f32, f32, f32, env)
+DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
+DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_2(rsqrte_f32, f32, f32, env)
-DEF_HELPER_2(recpe_u32, i32, i32, env)
+DEF_HELPER_2(recpe_u32, i32, i32, ptr)
 DEF_HELPER_2(rsqrte_u32, i32, i32, env)
 DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 0330ce9070..4402287bf1 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7140,6 +7140,9 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
         for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
             read_vec_element(s, tcg_op, rn, pass, MO_64);
             switch (opcode) {
+            case 0x3d: /* FRECPE */
+                gen_helper_recpe_f64(tcg_res, tcg_op, fpst);
+                break;
             case 0x3f: /* FRECPX */
                 gen_helper_frecpx_f64(tcg_res, tcg_op, fpst);
                 break;
@@ -7169,6 +7172,12 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
             read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
 
             switch (opcode) {
+            case 0x3c: /* URECPE */
+                gen_helper_recpe_u32(tcg_res, tcg_op, fpst);
+                break;
+            case 0x3d: /* FRECPE */
+                gen_helper_recpe_f32(tcg_res, tcg_op, fpst);
+                break;
             case 0x3f: /* FRECPX */
                 gen_helper_frecpx_f32(tcg_res, tcg_op, fpst);
                 break;
@@ -7247,6 +7256,7 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size);
             return;
         }
+        case 0x3d: /* FRECPE */
         case 0x3f: /* FRECPX */
             handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd);
             return;
@@ -7267,7 +7277,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             is_fcvt = true;
             rmode = FPROUNDING_TIEAWAY;
             break;
-        case 0x3d: /* FRECPE */
         case 0x56: /* FCVTXN, FCVTXN2 */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);
@@ -9205,6 +9214,15 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
+        case 0x3c: /* URECPE */
+            if (size == 3) {
+                unallocated_encoding(s);
+                return;
+            }
+            /* fall through */
+        case 0x3d: /* FRECPE */
+            handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd);
+            return;
         case 0x16: /* FCVTN, FCVTN2 */
             /* handle_2misc_narrow does a 2*size -> size operation, but these
              * instructions encode the source size rather than dest size.
@@ -9238,8 +9256,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
-        case 0x3c: /* URECPE */
-        case 0x3d: /* FRECPE */
         case 0x56: /* FCVTXN, FCVTXN2 */
         case 0x7c: /* URSQRTE */
         case 0x7d: /* FRSQRTE */
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 20042976f2..3771953e02 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -6682,14 +6682,22 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                             break;
                         }
                         case NEON_2RM_VRECPE:
-                            gen_helper_recpe_u32(tmp, tmp, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_recpe_u32(tmp, tmp, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VRSQRTE:
                             gen_helper_rsqrte_u32(tmp, tmp, cpu_env);
                             break;
                         case NEON_2RM_VRECPE_F:
-                            gen_helper_recpe_f32(cpu_F0s, cpu_F0s, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_recpe_f32(cpu_F0s, cpu_F0s, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VRSQRTE_F:
                             gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, cpu_env);
                             break;

From 8b092ca9ef06fd308ecf1d46c805f938a95acc21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:52 +0000
Subject: [PATCH 26/30] target-arm: A64: Move handle_2misc_narrow function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the handle_2misc_narrow() function up the file so that it can
be called from disas_simd_scalar_two_reg_misc().

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-22-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 180 ++++++++++++++++++-------------------
 1 file changed, 90 insertions(+), 90 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 4402287bf1..d88ebe224a 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7200,6 +7200,96 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
     tcg_temp_free_ptr(fpst);
 }
 
+static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
+                                int size, int rn, int rd)
+{
+    /* Handle 2-reg-misc ops which are narrowing (so each 2*size element
+     * in the source becomes a size element in the destination).
+     */
+    int pass;
+    TCGv_i32 tcg_res[2];
+    int destelt = is_q ? 2 : 0;
+
+    for (pass = 0; pass < 2; pass++) {
+        TCGv_i64 tcg_op = tcg_temp_new_i64();
+        NeonGenNarrowFn *genfn = NULL;
+        NeonGenNarrowEnvFn *genenvfn = NULL;
+
+        read_vec_element(s, tcg_op, rn, pass, MO_64);
+        tcg_res[pass] = tcg_temp_new_i32();
+
+        switch (opcode) {
+        case 0x12: /* XTN, SQXTUN */
+        {
+            static NeonGenNarrowFn * const xtnfns[3] = {
+                gen_helper_neon_narrow_u8,
+                gen_helper_neon_narrow_u16,
+                tcg_gen_trunc_i64_i32,
+            };
+            static NeonGenNarrowEnvFn * const sqxtunfns[3] = {
+                gen_helper_neon_unarrow_sat8,
+                gen_helper_neon_unarrow_sat16,
+                gen_helper_neon_unarrow_sat32,
+            };
+            if (u) {
+                genenvfn = sqxtunfns[size];
+            } else {
+                genfn = xtnfns[size];
+            }
+            break;
+        }
+        case 0x14: /* SQXTN, UQXTN */
+        {
+            static NeonGenNarrowEnvFn * const fns[3][2] = {
+                { gen_helper_neon_narrow_sat_s8,
+                  gen_helper_neon_narrow_sat_u8 },
+                { gen_helper_neon_narrow_sat_s16,
+                  gen_helper_neon_narrow_sat_u16 },
+                { gen_helper_neon_narrow_sat_s32,
+                  gen_helper_neon_narrow_sat_u32 },
+            };
+            genenvfn = fns[size][u];
+            break;
+        }
+        case 0x16: /* FCVTN, FCVTN2 */
+            /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */
+            if (size == 2) {
+                gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env);
+            } else {
+                TCGv_i32 tcg_lo = tcg_temp_new_i32();
+                TCGv_i32 tcg_hi = tcg_temp_new_i32();
+                tcg_gen_trunc_i64_i32(tcg_lo, tcg_op);
+                gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, cpu_env);
+                tcg_gen_shri_i64(tcg_op, tcg_op, 32);
+                tcg_gen_trunc_i64_i32(tcg_hi, tcg_op);
+                gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, cpu_env);
+                tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
+                tcg_temp_free_i32(tcg_lo);
+                tcg_temp_free_i32(tcg_hi);
+            }
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        if (genfn) {
+            genfn(tcg_res[pass], tcg_op);
+        } else if (genenvfn) {
+            genenvfn(tcg_res[pass], cpu_env, tcg_op);
+        }
+
+        tcg_temp_free_i64(tcg_op);
+    }
+
+    for (pass = 0; pass < 2; pass++) {
+        write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32);
+        tcg_temp_free_i32(tcg_res[pass]);
+    }
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+}
+
 /* C3.6.12 AdvSIMD scalar two reg misc
  *  31 30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
  * +-----+---+-----------+------+-----------+--------+-----+------+------+
@@ -8745,96 +8835,6 @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
     }
 }
 
-static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
-                                int size, int rn, int rd)
-{
-    /* Handle 2-reg-misc ops which are narrowing (so each 2*size element
-     * in the source becomes a size element in the destination).
-     */
-    int pass;
-    TCGv_i32 tcg_res[2];
-    int destelt = is_q ? 2 : 0;
-
-    for (pass = 0; pass < 2; pass++) {
-        TCGv_i64 tcg_op = tcg_temp_new_i64();
-        NeonGenNarrowFn *genfn = NULL;
-        NeonGenNarrowEnvFn *genenvfn = NULL;
-
-        read_vec_element(s, tcg_op, rn, pass, MO_64);
-        tcg_res[pass] = tcg_temp_new_i32();
-
-        switch (opcode) {
-        case 0x12: /* XTN, SQXTUN */
-        {
-            static NeonGenNarrowFn * const xtnfns[3] = {
-                gen_helper_neon_narrow_u8,
-                gen_helper_neon_narrow_u16,
-                tcg_gen_trunc_i64_i32,
-            };
-            static NeonGenNarrowEnvFn * const sqxtunfns[3] = {
-                gen_helper_neon_unarrow_sat8,
-                gen_helper_neon_unarrow_sat16,
-                gen_helper_neon_unarrow_sat32,
-            };
-            if (u) {
-                genenvfn = sqxtunfns[size];
-            } else {
-                genfn = xtnfns[size];
-            }
-            break;
-        }
-        case 0x14: /* SQXTN, UQXTN */
-        {
-            static NeonGenNarrowEnvFn * const fns[3][2] = {
-                { gen_helper_neon_narrow_sat_s8,
-                  gen_helper_neon_narrow_sat_u8 },
-                { gen_helper_neon_narrow_sat_s16,
-                  gen_helper_neon_narrow_sat_u16 },
-                { gen_helper_neon_narrow_sat_s32,
-                  gen_helper_neon_narrow_sat_u32 },
-            };
-            genenvfn = fns[size][u];
-            break;
-        }
-        case 0x16: /* FCVTN, FCVTN2 */
-            /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */
-            if (size == 2) {
-                gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env);
-            } else {
-                TCGv_i32 tcg_lo = tcg_temp_new_i32();
-                TCGv_i32 tcg_hi = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tcg_lo, tcg_op);
-                gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, cpu_env);
-                tcg_gen_shri_i64(tcg_op, tcg_op, 32);
-                tcg_gen_trunc_i64_i32(tcg_hi, tcg_op);
-                gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, cpu_env);
-                tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
-                tcg_temp_free_i32(tcg_lo);
-                tcg_temp_free_i32(tcg_hi);
-            }
-            break;
-        default:
-            g_assert_not_reached();
-        }
-
-        if (genfn) {
-            genfn(tcg_res[pass], tcg_op);
-        } else if (genenvfn) {
-            genenvfn(tcg_res[pass], cpu_env, tcg_op);
-        }
-
-        tcg_temp_free_i64(tcg_op);
-    }
-
-    for (pass = 0; pass < 2; pass++) {
-        write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32);
-        tcg_temp_free_i32(tcg_res[pass]);
-    }
-    if (!is_q) {
-        clear_vec_high(s, rd);
-    }
-}
-
 static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q,
                                   int size, int rn, int rd)
 {

From 5201c13654c35e5e0173a9947848f3a9f9a5a8bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:52 +0000
Subject: [PATCH 27/30] target-arm: A64: Implement scalar saturating narrow ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This completes the set of integer narrowing saturating ops including:
     SQXTN, SQXTN2
     SQXTUN, SQXTUN2
     UQXTN, UQXTN2

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-23-git-send-email-peter.maydell@linaro.org
---
 target-arm/translate-a64.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index d88ebe224a..5f4c6bf22f 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7200,7 +7200,8 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
     tcg_temp_free_ptr(fpst);
 }
 
-static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
+static void handle_2misc_narrow(DisasContext *s, bool scalar,
+                                int opcode, bool u, bool is_q,
                                 int size, int rn, int rd)
 {
     /* Handle 2-reg-misc ops which are narrowing (so each 2*size element
@@ -7209,13 +7210,22 @@ static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q,
     int pass;
     TCGv_i32 tcg_res[2];
     int destelt = is_q ? 2 : 0;
+    int passes = scalar ? 1 : 2;
 
-    for (pass = 0; pass < 2; pass++) {
+    if (scalar) {
+        tcg_res[1] = tcg_const_i32(0);
+    }
+
+    for (pass = 0; pass < passes; pass++) {
         TCGv_i64 tcg_op = tcg_temp_new_i64();
         NeonGenNarrowFn *genfn = NULL;
         NeonGenNarrowEnvFn *genenvfn = NULL;
 
-        read_vec_element(s, tcg_op, rn, pass, MO_64);
+        if (scalar) {
+            read_vec_element(s, tcg_op, rn, pass, size + 1);
+        } else {
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+        }
         tcg_res[pass] = tcg_temp_new_i32();
 
         switch (opcode) {
@@ -7323,6 +7333,19 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             return;
         }
         break;
+    case 0x12: /* SQXTUN */
+        if (u) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
+    case 0x14: /* SQXTN, UQXTN */
+        if (size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd);
+        return;
     case 0xc ... 0xf:
     case 0x16 ... 0x1d:
     case 0x1f:
@@ -7379,8 +7402,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
     default:
         /* Other categories of encoding in this class:
          *  + SUQADD/USQADD/SQABS/SQNEG : size 8, 16, 32 or 64
-         *  + SQXTN/SQXTN2/SQXTUN/SQXTUN2/UQXTN/UQXTN2:
-         *    narrowing saturate ops: size 64/32/16 -> 32/16/8
          */
         unsupported_encoding(s, insn);
         return;
@@ -9096,7 +9117,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
-        handle_2misc_narrow(s, opcode, u, is_q, size, rn, rd);
+        handle_2misc_narrow(s, false, opcode, u, is_q, size, rn, rd);
         return;
     case 0x4: /* CLS, CLZ */
         if (size == 3) {
@@ -9227,7 +9248,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             /* handle_2misc_narrow does a 2*size -> size operation, but these
              * instructions encode the source size rather than dest size.
              */
-            handle_2misc_narrow(s, opcode, 0, is_q, size - 1, rn, rd);
+            handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd);
             return;
         case 0x17: /* FCVTL, FCVTL2 */
             handle_2misc_widening(s, opcode, is_q, size, rn, rd);

From 5553955eb6ec890f324a2ff6c6cc1365b98b981f Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:53 +0000
Subject: [PATCH 28/30] target-arm: A64: Implement FCVTXN

Implement the FCVTXN operation, which does a narrowing fp precision
conversion using the "round to odd" (von Neumann) mode. This can
conveniently be implemented as "do operation using round to zero;
then set the LSB of the mantissa to 1 if the Inexact flag was set".

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-24-git-send-email-peter.maydell@linaro.org
---
 target-arm/helper-a64.c    | 23 +++++++++++++++++++++++
 target-arm/helper-a64.h    |  1 +
 target-arm/translate-a64.c | 20 +++++++++++++++++++-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index cea2468b30..ec0258295f 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -413,3 +413,26 @@ float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
         return make_float64(sbit | (~exp & 0x7ffULL) << 52);
     }
 }
+
+float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
+{
+    /* Von Neumann rounding is implemented by using round-to-zero
+     * and then setting the LSB of the result if Inexact was raised.
+     */
+    float32 r;
+    float_status *fpst = &env->vfp.fp_status;
+    float_status tstat = *fpst;
+    int exflags;
+
+    set_float_rounding_mode(float_round_to_zero, &tstat);
+    set_float_exception_flags(0, &tstat);
+    r = float64_to_float32(a, &tstat);
+    r = float32_maybe_silence_nan(r);
+    exflags = get_float_exception_flags(&tstat);
+    if (exflags & float_flag_inexact) {
+        r = make_float32(float32_val(r) | 1);
+    }
+    exflags |= get_float_exception_flags(fpst);
+    set_float_exception_flags(exflags, fpst);
+    return r;
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 8cbc3492d4..3f05bedcca 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -45,3 +45,4 @@ DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
+DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 5f4c6bf22f..235f880589 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7278,6 +7278,13 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
                 tcg_temp_free_i32(tcg_hi);
             }
             break;
+        case 0x56:  /* FCVTXN, FCVTXN2 */
+            /* 64 bit to 32 bit float conversion
+             * with von Neumann rounding (round to odd)
+             */
+            assert(size == 2);
+            gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env);
+            break;
         default:
             g_assert_not_reached();
         }
@@ -7391,6 +7398,12 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             rmode = FPROUNDING_TIEAWAY;
             break;
         case 0x56: /* FCVTXN, FCVTXN2 */
+            if (size == 2) {
+                unallocated_encoding(s);
+                return;
+            }
+            handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd);
+            return;
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);
             return;
@@ -9244,6 +9257,12 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x3d: /* FRECPE */
             handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd);
             return;
+        case 0x56: /* FCVTXN, FCVTXN2 */
+            if (size == 2) {
+                unallocated_encoding(s);
+                return;
+            }
+            /* fall through */
         case 0x16: /* FCVTN, FCVTN2 */
             /* handle_2misc_narrow does a 2*size -> size operation, but these
              * instructions encode the source size rather than dest size.
@@ -9277,7 +9296,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
-        case 0x56: /* FCVTXN, FCVTXN2 */
         case 0x7c: /* URSQRTE */
         case 0x7d: /* FRSQRTE */
             unsupported_encoding(s, insn);

From c2fb418e35be3eb1f60987174f94c029f7d4dd7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 17 Mar 2014 16:31:53 +0000
Subject: [PATCH 29/30] target-arm: A64: Add [UF]RSQRTE (reciprocal root
 estimate)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for [UF]RSQRTE instructions. It utilises the existing
NEON helpers with some changes. The changes include an explicit passing
of fpstatus (so the correct one is used between arm32 and aarch64),
denormilzation, more correct error handling and also proper scaling of
the fraction going into the estimate.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-25-git-send-email-peter.maydell@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/helper.c        | 139 +++++++++++++++++++++++++++++--------
 target-arm/helper.h        |   5 +-
 target-arm/translate-a64.c |  27 +++++--
 target-arm/translate.c     |  12 +++-
 4 files changed, 143 insertions(+), 40 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 535fc8f35e..55077ed1b6 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4721,12 +4721,12 @@ float64 HELPER(recpe_f64)(float64 input, void *fpstp)
 /* The algorithm that must be used to calculate the estimate
  * is specified by the ARM ARM.
  */
-static float64 recip_sqrt_estimate(float64 a, CPUARMState *env)
+static float64 recip_sqrt_estimate(float64 a, float_status *real_fp_status)
 {
     /* These calculations mustn't set any fp exception flags,
      * so we use a local copy of the fp_status.
      */
-    float_status dummy_status = env->vfp.standard_fp_status;
+    float_status dummy_status = *real_fp_status;
     float_status *s = &dummy_status;
     float64 q;
     int64_t q_int;
@@ -4773,49 +4773,64 @@ static float64 recip_sqrt_estimate(float64 a, CPUARMState *env)
     return float64_div(int64_to_float64(q_int, s), float64_256, s);
 }
 
-float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env)
+float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
 {
-    float_status *s = &env->vfp.standard_fp_status;
+    float_status *s = fpstp;
+    float32 f32 = float32_squash_input_denormal(input, s);
+    uint32_t val = float32_val(f32);
+    uint32_t f32_sbit = 0x80000000 & val;
+    int32_t f32_exp = extract32(val, 23, 8);
+    uint32_t f32_frac = extract32(val, 0, 23);
+    uint64_t f64_frac;
+    uint64_t val64;
     int result_exp;
     float64 f64;
-    uint32_t val;
-    uint64_t val64;
 
-    val = float32_val(a);
-
-    if (float32_is_any_nan(a)) {
-        if (float32_is_signaling_nan(a)) {
+    if (float32_is_any_nan(f32)) {
+        float32 nan = f32;
+        if (float32_is_signaling_nan(f32)) {
             float_raise(float_flag_invalid, s);
+            nan = float32_maybe_silence_nan(f32);
         }
-        return float32_default_nan;
-    } else if (float32_is_zero_or_denormal(a)) {
-        if (!float32_is_zero(a)) {
-            float_raise(float_flag_input_denormal, s);
+        if (s->default_nan_mode) {
+            nan =  float32_default_nan;
         }
+        return nan;
+    } else if (float32_is_zero(f32)) {
         float_raise(float_flag_divbyzero, s);
-        return float32_set_sign(float32_infinity, float32_is_neg(a));
-    } else if (float32_is_neg(a)) {
+        return float32_set_sign(float32_infinity, float32_is_neg(f32));
+    } else if (float32_is_neg(f32)) {
         float_raise(float_flag_invalid, s);
         return float32_default_nan;
-    } else if (float32_is_infinity(a)) {
+    } else if (float32_is_infinity(f32)) {
         return float32_zero;
     }
 
-    /* Normalize to a double-precision value between 0.25 and 1.0,
+    /* Scale and normalize to a double-precision value between 0.25 and 1.0,
      * preserving the parity of the exponent.  */
-    if ((val & 0x800000) == 0) {
-        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
-                           | (0x3feULL << 52)
-                           | ((uint64_t)(val & 0x7fffff) << 29));
-    } else {
-        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
-                           | (0x3fdULL << 52)
-                           | ((uint64_t)(val & 0x7fffff) << 29));
+
+    f64_frac = ((uint64_t) f32_frac) << 29;
+    if (f32_exp == 0) {
+        while (extract64(f64_frac, 51, 1) == 0) {
+            f64_frac = f64_frac << 1;
+            f32_exp = f32_exp-1;
+        }
+        f64_frac = extract64(f64_frac, 0, 51) << 1;
     }
 
-    result_exp = (380 - ((val & 0x7f800000) >> 23)) / 2;
+    if (extract64(f32_exp, 0, 1) == 0) {
+        f64 = make_float64(((uint64_t) f32_sbit) << 32
+                           | (0x3feULL << 52)
+                           | f64_frac);
+    } else {
+        f64 = make_float64(((uint64_t) f32_sbit) << 32
+                           | (0x3fdULL << 52)
+                           | f64_frac);
+    }
 
-    f64 = recip_sqrt_estimate(f64, env);
+    result_exp = (380 - f32_exp) / 2;
+
+    f64 = recip_sqrt_estimate(f64, s);
 
     val64 = float64_val(f64);
 
@@ -4824,6 +4839,69 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env)
     return make_float32(val);
 }
 
+float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
+{
+    float_status *s = fpstp;
+    float64 f64 = float64_squash_input_denormal(input, s);
+    uint64_t val = float64_val(f64);
+    uint64_t f64_sbit = 0x8000000000000000ULL & val;
+    int64_t f64_exp = extract64(val, 52, 11);
+    uint64_t f64_frac = extract64(val, 0, 52);
+    int64_t result_exp;
+    uint64_t result_frac;
+
+    if (float64_is_any_nan(f64)) {
+        float64 nan = f64;
+        if (float64_is_signaling_nan(f64)) {
+            float_raise(float_flag_invalid, s);
+            nan = float64_maybe_silence_nan(f64);
+        }
+        if (s->default_nan_mode) {
+            nan =  float64_default_nan;
+        }
+        return nan;
+    } else if (float64_is_zero(f64)) {
+        float_raise(float_flag_divbyzero, s);
+        return float64_set_sign(float64_infinity, float64_is_neg(f64));
+    } else if (float64_is_neg(f64)) {
+        float_raise(float_flag_invalid, s);
+        return float64_default_nan;
+    } else if (float64_is_infinity(f64)) {
+        return float64_zero;
+    }
+
+    /* Scale and normalize to a double-precision value between 0.25 and 1.0,
+     * preserving the parity of the exponent.  */
+
+    if (f64_exp == 0) {
+        while (extract64(f64_frac, 51, 1) == 0) {
+            f64_frac = f64_frac << 1;
+            f64_exp = f64_exp - 1;
+        }
+        f64_frac = extract64(f64_frac, 0, 51) << 1;
+    }
+
+    if (extract64(f64_exp, 0, 1) == 0) {
+        f64 = make_float64(f64_sbit
+                           | (0x3feULL << 52)
+                           | f64_frac);
+    } else {
+        f64 = make_float64(f64_sbit
+                           | (0x3fdULL << 52)
+                           | f64_frac);
+    }
+
+    result_exp = (3068 - f64_exp) / 2;
+
+    f64 = recip_sqrt_estimate(f64, s);
+
+    result_frac = extract64(float64_val(f64), 0, 52);
+
+    return make_float64(f64_sbit |
+                        ((result_exp & 0x7ff) << 52) |
+                        result_frac);
+}
+
 uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
 {
     float_status *s = fpstp;
@@ -4841,8 +4919,9 @@ uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
     return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 
-uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env)
+uint32_t HELPER(rsqrte_u32)(uint32_t a, void *fpstp)
 {
+    float_status *fpst = fpstp;
     float64 f64;
 
     if ((a & 0xc0000000) == 0) {
@@ -4857,7 +4936,7 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env)
                            | ((uint64_t)(a & 0x3fffffff) << 22));
     }
 
-    f64 = recip_sqrt_estimate(f64, env);
+    f64 = recip_sqrt_estimate(f64, fpst);
 
     return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
diff --git a/target-arm/helper.h b/target-arm/helper.h
index f96a82415a..a3d6f32b06 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -169,9 +169,10 @@ DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
-DEF_HELPER_2(rsqrte_f32, f32, f32, env)
+DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
+DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_2(recpe_u32, i32, i32, ptr)
-DEF_HELPER_2(rsqrte_u32, i32, i32, env)
+DEF_HELPER_FLAGS_2(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32, ptr)
 DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 
 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 235f880589..befffac2e3 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -7146,6 +7146,9 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
             case 0x3f: /* FRECPX */
                 gen_helper_frecpx_f64(tcg_res, tcg_op, fpst);
                 break;
+            case 0x7d: /* FRSQRTE */
+                gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst);
+                break;
             default:
                 g_assert_not_reached();
             }
@@ -7181,6 +7184,9 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
             case 0x3f: /* FRECPX */
                 gen_helper_frecpx_f32(tcg_res, tcg_op, fpst);
                 break;
+            case 0x7d: /* FRSQRTE */
+                gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst);
+                break;
             default:
                 g_assert_not_reached();
             }
@@ -7378,6 +7384,7 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
         }
         case 0x3d: /* FRECPE */
         case 0x3f: /* FRECPX */
+        case 0x7d: /* FRSQRTE */
             handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd);
             return;
         case 0x1a: /* FCVTNS */
@@ -7404,9 +7411,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             }
             handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd);
             return;
-        case 0x7d: /* FRSQRTE */
-            unsupported_encoding(s, insn);
-            return;
         default:
             unallocated_encoding(s);
             return;
@@ -9255,6 +9259,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             }
             /* fall through */
         case 0x3d: /* FRECPE */
+        case 0x7d: /* FRSQRTE */
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
             handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd);
             return;
         case 0x56: /* FCVTXN, FCVTXN2 */
@@ -9297,9 +9306,12 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
             }
             break;
         case 0x7c: /* URSQRTE */
-        case 0x7d: /* FRSQRTE */
-            unsupported_encoding(s, insn);
-            return;
+            if (size == 3) {
+                unallocated_encoding(s);
+                return;
+            }
+            need_fpstatus = true;
+            break;
         default:
             unallocated_encoding(s);
             return;
@@ -9432,6 +9444,9 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 case 0x59: /* FRINTX */
                     gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus);
                     break;
+                case 0x7c: /* URSQRTE */
+                    gen_helper_rsqrte_u32(tcg_res, tcg_op, tcg_fpstatus);
+                    break;
                 default:
                     g_assert_not_reached();
                 }
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 3771953e02..56e3b4bf7f 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -6689,8 +6689,12 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                             break;
                         }
                         case NEON_2RM_VRSQRTE:
-                            gen_helper_rsqrte_u32(tmp, tmp, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_rsqrte_u32(tmp, tmp, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VRECPE_F:
                         {
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
@@ -6699,8 +6703,12 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                             break;
                         }
                         case NEON_2RM_VRSQRTE_F:
-                            gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VCVT_FS: /* VCVT.F32.S32 */
                             gen_vfp_sito(0, 1);
                             break;

From 1ed27a17cd9d9ebec8963bc358d74060b1dd6127 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 17 Mar 2014 16:31:53 +0000
Subject: [PATCH 30/30] scripts/qemu-binfmt-conf.sh: Add AArch64 registration

Add the binfmt-misc magic needed to register QEMU for handling AArch64
ELF binaries.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1394822294-14837-26-git-send-email-peter.maydell@linaro.org
---
 scripts/qemu-binfmt-conf.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/qemu-binfmt-conf.sh b/scripts/qemu-binfmt-conf.sh
index 0da2618883..289b1a3963 100644
--- a/scripts/qemu-binfmt-conf.sh
+++ b/scripts/qemu-binfmt-conf.sh
@@ -41,6 +41,9 @@ if [ $cpu != "arm" ] ; then
     echo   ':arm:M::\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/local/bin/qemu-arm:' > /proc/sys/fs/binfmt_misc/register
     echo   ':armeb:M::\x7fELF\x01\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/local/bin/qemu-armeb:' > /proc/sys/fs/binfmt_misc/register
 fi
+if [ $cpu != "aarch64" ] ; then
+    echo ':aarch64:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/local/bin/qemu-aarch64:' > /proc/sys/fs/binfmt_misc/register
+fi
 if [ $cpu != "sparc" ] ; then
     echo   ':sparc:M::\x7fELF\x01\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x02:\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/local/bin/qemu-sparc:' > /proc/sys/fs/binfmt_misc/register
 fi