diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 6711b58e0b..245fd6327d 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -682,13 +682,8 @@ static inline bool cpu_handle_halt(CPUState *cpu)
 #ifndef CONFIG_USER_ONLY
     if (cpu->halted) {
         const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;
-        bool leave_halt;
+        bool leave_halt = tcg_ops->cpu_exec_halt(cpu);
 
-        if (tcg_ops->cpu_exec_halt) {
-            leave_halt = tcg_ops->cpu_exec_halt(cpu);
-        } else {
-            leave_halt = cpu_has_work(cpu);
-        }
         if (!leave_halt) {
             return true;
         }
@@ -1082,6 +1077,10 @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp)
     static bool tcg_target_initialized;
 
     if (!tcg_target_initialized) {
+        /* Check mandatory TCGCPUOps handlers */
+#ifndef CONFIG_USER_ONLY
+        assert(cpu->cc->tcg_ops->cpu_exec_halt);
+#endif /* !CONFIG_USER_ONLY */
        cpu->cc->tcg_ops->initialize();
        tcg_target_initialized = true;
    }
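
With cpu_exec_halt now mandatory for system emulation, a target that needs
no special halt handling can simply point the hook at the same function it
already uses for CPUClass::has_work, since the two share a signature (the
alpha and Arm hunks later in this series do exactly that). A minimal sketch
for a hypothetical target, not part of the patch itself:

    static bool mycpu_has_work(CPUState *cs)
    {
        /* Sketch: leave halt when a hard interrupt is pending */
        return cs->interrupt_request & CPU_INTERRUPT_HARD;
    }

    static const TCGCPUOps mycpu_tcg_ops = {
        /* ... other handlers ... */
        .cpu_exec_halt = mycpu_has_work, /* reused as the halt predicate */
    };
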
diff --git a/hw/arm/stm32l4x5_soc.c b/hw/arm/stm32l4x5_soc.c
index 38f7a2d5d9..fac83d349c 100644
--- a/hw/arm/stm32l4x5_soc.c
+++ b/hw/arm/stm32l4x5_soc.c
@@ -81,6 +81,10 @@ static const int exti_irq[NUM_EXTI_IRQ] = {
 #define RCC_BASE_ADDRESS 0x40021000
 #define RCC_IRQ 5
 
+#define EXTI_USART1_IRQ 26
+#define EXTI_UART4_IRQ 29
+#define EXTI_LPUART1_IRQ 31
+
 static const int exti_or_gates_out[NUM_EXTI_OR_GATES] = {
     23, 40, 63, 1,
 };
@@ -129,10 +133,6 @@ static const hwaddr uart_addr[] = {
 
 #define LPUART_BASE_ADDRESS 0x40008000
 
-static const int usart_irq[] = { 37, 38, 39 };
-static const int uart_irq[] = { 52, 53 };
-#define LPUART_IRQ 70
-
 static void stm32l4x5_soc_initfn(Object *obj)
 {
     Stm32l4x5SocState *s = STM32L4X5_SOC(obj);
@@ -297,6 +297,7 @@ static void stm32l4x5_soc_realize(DeviceState *dev_soc, Error **errp)
         }
     }
 
+    /* Connect SYSCFG to EXTI */
     for (unsigned i = 0; i < GPIO_NUM_PINS; i++) {
         qdev_connect_gpio_out(DEVICE(&s->syscfg), i,
                               qdev_get_gpio_in(DEVICE(&s->exti), i));
@@ -322,15 +323,10 @@ static void stm32l4x5_soc_realize(DeviceState *dev_soc, Error **errp)
             return;
         }
         sysbus_mmio_map(busdev, 0, usart_addr[i]);
-        sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, usart_irq[i]));
+        sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(DEVICE(&s->exti),
+                                                       EXTI_USART1_IRQ + i));
     }
 
-    /*
-     * TODO: Connect the USARTs, UARTs and LPUART to the EXTI once the EXTI
-     * can handle other gpio-in than the gpios. (e.g. Direct Lines for the
-     * usarts)
-     */
-
     /* UART devices */
     for (int i = 0; i < STM_NUM_UARTS; i++) {
         g_autofree char *name = g_strdup_printf("uart%d-out", STM_NUM_USARTS + i + 1);
@@ -343,7 +339,8 @@ static void stm32l4x5_soc_realize(DeviceState *dev_soc, Error **errp)
             return;
         }
         sysbus_mmio_map(busdev, 0, uart_addr[i]);
-        sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, uart_irq[i]));
+        sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(DEVICE(&s->exti),
+                                                       EXTI_UART4_IRQ + i));
     }
 
     /* LPUART device*/
@@ -356,7 +353,8 @@ static void stm32l4x5_soc_realize(DeviceState *dev_soc, Error **errp)
         return;
     }
     sysbus_mmio_map(busdev, 0, LPUART_BASE_ADDRESS);
-    sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, LPUART_IRQ));
+    sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(DEVICE(&s->exti),
+                                                   EXTI_LPUART1_IRQ));
 
     /* APB1 BUS */
     create_unimplemented_device("TIM2",        0x40000000, 0x400);
diff --git a/hw/char/pl011.c b/hw/char/pl011.c
index 8753b84a84..f8078aa216 100644
--- a/hw/char/pl011.c
+++ b/hw/char/pl011.c
@@ -87,6 +87,12 @@ DeviceState *pl011_create(hwaddr addr, qemu_irq irq, Chardev *chr)
 #define CR_DTR (1 << 10)
 #define CR_LBE (1 << 7)
 
+/* Integer Baud Rate Divider, UARTIBRD */
+#define IBRD_MASK 0xffff
+
+/* Fractional Baud Rate Divider, UARTFBRD */
+#define FBRD_MASK 0x3f
+
 static const unsigned char pl011_id_arm[8] =
   { 0x11, 0x10, 0x14, 0x00, 0x0d, 0xf0, 0x05, 0xb1 };
 static const unsigned char pl011_id_luminary[8] =
@@ -374,11 +380,11 @@ static void pl011_write(void *opaque, hwaddr offset,
         s->ilpr = value;
         break;
     case 9: /* UARTIBRD */
-        s->ibrd = value;
+        s->ibrd = value & IBRD_MASK;
         pl011_trace_baudrate_change(s);
         break;
     case 10: /* UARTFBRD */
-        s->fbrd = value;
+        s->fbrd = value & FBRD_MASK;
         pl011_trace_baudrate_change(s);
         break;
     case 11: /* UARTLCR_H */
@@ -531,6 +537,9 @@ static int pl011_post_load(void *opaque, int version_id)
         s->read_pos = 0;
     }
 
+    s->ibrd &= IBRD_MASK;
+    s->fbrd &= FBRD_MASK;
+
     return 0;
 }
 
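
The masks make the two divider registers behave as the PL011 hardware does:
UARTIBRD holds a 16-bit integer divisor and UARTFBRD a 6-bit fractional
divisor, so stray high bits written by a guest (or arriving in an incoming
migration stream, hence the post_load clamp) are discarded. For example,
assuming the masks above:

    s->ibrd = 0x12345 & IBRD_MASK;   /* stored as 0x2345 */
    s->fbrd = 0x12345 & FBRD_MASK;   /* stored as 0x05 */
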
diff --git a/hw/misc/bcm2835_thermal.c b/hw/misc/bcm2835_thermal.c
index ee7816b8a5..0c49c088a7 100644
--- a/hw/misc/bcm2835_thermal.c
+++ b/hw/misc/bcm2835_thermal.c
@@ -80,8 +80,10 @@ static void bcm2835_thermal_write(void *opaque, hwaddr addr,
 static const MemoryRegionOps bcm2835_thermal_ops = {
     .read = bcm2835_thermal_read,
     .write = bcm2835_thermal_write,
+    .impl.min_access_size = 4,
     .impl.max_access_size = 4,
     .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
diff --git a/hw/misc/stm32l4x5_exti.c b/hw/misc/stm32l4x5_exti.c
index 6a2ec62d78..e281841dcf 100644
--- a/hw/misc/stm32l4x5_exti.c
+++ b/hw/misc/stm32l4x5_exti.c
@@ -42,7 +42,6 @@
 #define EXTI_SWIER2 0x30
 #define EXTI_PR2 0x34
 
-#define EXTI_NUM_GPIO_EVENT_IN_LINES 16
 #define EXTI_MAX_IRQ_PER_BANK 32
 #define EXTI_IRQS_BANK0 32
 #define EXTI_IRQS_BANK1 8
@@ -114,6 +113,13 @@ static void stm32l4x5_exti_set_irq(void *opaque, int irq, int level)
         return;
     }
 
+    /* In case of a direct line interrupt */
+    if (extract32(exti_romask[bank], irq, 1)) {
+        qemu_set_irq(s->irq[oirq], level);
+        return;
+    }
+
+    /* In case of a configurable interrupt */
     if ((level && extract32(s->rtsr[bank], irq, 1)) ||
         (!level && extract32(s->ftsr[bank], irq, 1))) {
 
@@ -238,7 +244,7 @@ static void stm32l4x5_exti_init(Object *obj)
 {
     Stm32l4x5ExtiState *s = STM32L4X5_EXTI(obj);
 
-    for (size_t i = 0; i < EXTI_NUM_INTERRUPT_OUT_LINES; i++) {
+    for (size_t i = 0; i < EXTI_NUM_LINES; i++) {
         sysbus_init_irq(SYS_BUS_DEVICE(obj), &s->irq[i]);
     }
 
@@ -246,8 +252,7 @@ static void stm32l4x5_exti_init(Object *obj)
     memory_region_init_io(&s->mmio, obj, &stm32l4x5_exti_ops, s,
                           TYPE_STM32L4X5_EXTI, 0x400);
     sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->mmio);
-    qdev_init_gpio_in(DEVICE(obj), stm32l4x5_exti_set_irq,
-                      EXTI_NUM_GPIO_EVENT_IN_LINES);
+    qdev_init_gpio_in(DEVICE(obj), stm32l4x5_exti_set_irq, EXTI_NUM_LINES);
 }
 
 static const VMStateDescription vmstate_stm32l4x5_exti = {
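
With this change the EXTI distinguishes its two kinds of input line: direct
lines (those flagged in exti_romask) are forwarded to the output IRQ
unconditionally, while configurable lines still go through the RTSR/FTSR
edge selection. An illustrative use, assuming the STM32L4x5 mapping where
USART1 is direct line 26:

    /* A USART raising its interrupt now targets the EXTI, not the NVIC: */
    qemu_set_irq(qdev_get_gpio_in(DEVICE(&s->exti), 26), 1);
    /* ...which stm32l4x5_exti_set_irq() passes straight through to
     * s->irq[oirq], because line 26 is set in exti_romask. */
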
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index 099de3375e..34318cf0e6 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -122,10 +122,13 @@ struct TCGCPUOps {
      * to do when the CPU is in the halted state.
      *
      * Return true to indicate that the CPU should now leave halt, false
-     * if it should remain in the halted state.
+     * if it should remain in the halted state. (This should generally
+     * be the same value that cpu_has_work() would return.)
      *
-     * If this method is not provided, the default is to do nothing, and
-     * to leave halt if cpu_has_work() returns true.
+     * This method must be provided. If the target does not need to
+     * do anything special for halt, the same function used for its
+     * CPUClass::has_work method can be used here, as they have the
+     * same function signature.
      */
     bool (*cpu_exec_halt)(CPUState *cpu);
     /**
diff --git a/include/hw/misc/stm32l4x5_exti.h b/include/hw/misc/stm32l4x5_exti.h
index 55f763fa37..62f79362f2 100644
--- a/include/hw/misc/stm32l4x5_exti.h
+++ b/include/hw/misc/stm32l4x5_exti.h
@@ -30,7 +30,7 @@
 #define TYPE_STM32L4X5_EXTI "stm32l4x5-exti"
 OBJECT_DECLARE_SIMPLE_TYPE(Stm32l4x5ExtiState, STM32L4X5_EXTI)
 
-#define EXTI_NUM_INTERRUPT_OUT_LINES 40
+#define EXTI_NUM_LINES 40
 #define EXTI_NUM_REGISTER 2
 
 struct Stm32l4x5ExtiState {
@@ -47,7 +47,7 @@ struct Stm32l4x5ExtiState {
     /* used for edge detection */
     uint32_t irq_levels[EXTI_NUM_REGISTER];
 
-    qemu_irq irq[EXTI_NUM_INTERRUPT_OUT_LINES];
+    qemu_irq irq[EXTI_NUM_LINES];
 };
 
 #endif
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index 0e2fbcb397..9db1dffc03 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -219,6 +219,7 @@ static const TCGCPUOps alpha_tcg_ops = {
 #else
     .tlb_fill = alpha_cpu_tlb_fill,
     .cpu_exec_interrupt = alpha_cpu_exec_interrupt,
+    .cpu_exec_halt = alpha_cpu_has_work,
     .do_interrupt = alpha_cpu_do_interrupt,
     .do_transaction_failed = alpha_cpu_do_transaction_failed,
     .do_unaligned_access = alpha_cpu_do_unaligned_access,
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 14d4eca127..19191c2391 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1133,7 +1133,7 @@ static bool arm_cpu_virtio_is_big_endian(CPUState *cs)
 }
 
 #ifdef CONFIG_TCG
-static bool arm_cpu_exec_halt(CPUState *cs)
+bool arm_cpu_exec_halt(CPUState *cs)
 {
     bool leave_halt = cpu_has_work(cs);
 
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index d8eb986a04..a12859fc53 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -619,6 +619,13 @@ typedef struct CPUArchState {
         int vec_len;
         int vec_stride;
 
+        /*
+         * Floating point status and control registers. Some bits are
+         * stored separately in other fields or in the float_status below.
+         */
+        uint64_t fpsr;
+        uint64_t fpcr;
+
         uint32_t xregs[16]; /* Scratch space for aa32 neon expansion. */
 
@@ -1680,61 +1687,99 @@ static inline void xpsr_write(CPUARMState *env, uint32_t val, uint32_t mask)
 uint32_t vfp_get_fpscr(CPUARMState *env);
 void vfp_set_fpscr(CPUARMState *env, uint32_t val);
 
-/* FPCR, Floating Point Control Register
- * FPSR, Floating Poiht Status Register
+/*
+ * FPCR, Floating Point Control Register
+ * FPSR, Floating Point Status Register
  *
- * For A64 the FPSCR is split into two logically distinct registers,
- * FPCR and FPSR. However since they still use non-overlapping bits
- * we store the underlying state in fpscr and just mask on read/write.
+ * For A64 floating point control and status bits are stored in
+ * two logically distinct registers, FPCR and FPSR. We store these
+ * in QEMU in vfp.fpcr and vfp.fpsr.
+ * For A32 there was only one register, FPSCR. The bits are arranged
+ * such that FPSCR bits map to FPCR or FPSR bits in the same bit positions,
+ * so we can use appropriate masking to handle FPSCR reads and writes.
+ * Note that the FPCR has some bits which are not visible in the
+ * AArch32 view (for FEAT_AFP). Writing the FPSCR leaves these unchanged.
  */
-#define FPSR_MASK 0xf800009f
-#define FPCR_MASK 0x07ff9f00
+/* FPCR bits */
 #define FPCR_IOE    (1 << 8)    /* Invalid Operation exception trap enable */
 #define FPCR_DZE    (1 << 9)    /* Divide by Zero exception trap enable */
 #define FPCR_OFE    (1 << 10)   /* Overflow exception trap enable */
 #define FPCR_UFE    (1 << 11)   /* Underflow exception trap enable */
 #define FPCR_IXE    (1 << 12)   /* Inexact exception trap enable */
 #define FPCR_IDE    (1 << 15)   /* Input Denormal exception trap enable */
+#define FPCR_LEN_MASK (7 << 16) /* LEN, A-profile only */
 #define FPCR_FZ16   (1 << 19)   /* ARMv8.2+, FP16 flush-to-zero */
+#define FPCR_STRIDE_MASK (3 << 20) /* Stride */
 #define FPCR_RMODE_MASK (3 << 22) /* Rounding mode */
 #define FPCR_FZ     (1 << 24)   /* Flush-to-zero enable bit */
 #define FPCR_DN     (1 << 25)   /* Default NaN enable bit */
 #define FPCR_AHP    (1 << 26)   /* Alternative half-precision */
-#define FPCR_QC     (1 << 27)   /* Cumulative saturation bit */
-#define FPCR_V      (1 << 28)   /* FP overflow flag */
-#define FPCR_C      (1 << 29)   /* FP carry flag */
-#define FPCR_Z      (1 << 30)   /* FP zero flag */
-#define FPCR_N      (1 << 31)   /* FP negative flag */
 #define FPCR_LTPSIZE_SHIFT 16   /* LTPSIZE, M-profile only */
 #define FPCR_LTPSIZE_MASK (7 << FPCR_LTPSIZE_SHIFT)
 #define FPCR_LTPSIZE_LENGTH 3
 
-#define FPCR_NZCV_MASK (FPCR_N | FPCR_Z | FPCR_C | FPCR_V)
-#define FPCR_NZCVQC_MASK (FPCR_NZCV_MASK | FPCR_QC)
+/* Cumulative exception trap enable bits */
+#define FPCR_EEXC_MASK (FPCR_IOE | FPCR_DZE | FPCR_OFE | FPCR_UFE | FPCR_IXE | FPCR_IDE)
 
-static inline uint32_t vfp_get_fpsr(CPUARMState *env)
-{
-    return vfp_get_fpscr(env) & FPSR_MASK;
-}
+/* FPSR bits */
+#define FPSR_IOC    (1 << 0)    /* Invalid Operation cumulative exception */
+#define FPSR_DZC    (1 << 1)    /* Divide by Zero cumulative exception */
+#define FPSR_OFC    (1 << 2)    /* Overflow cumulative exception */
+#define FPSR_UFC    (1 << 3)    /* Underflow cumulative exception */
+#define FPSR_IXC    (1 << 4)    /* Inexact cumulative exception */
+#define FPSR_IDC    (1 << 7)    /* Input Denormal cumulative exception */
+#define FPSR_QC     (1 << 27)   /* Cumulative saturation bit */
+#define FPSR_V      (1 << 28)   /* FP overflow flag */
+#define FPSR_C      (1 << 29)   /* FP carry flag */
+#define FPSR_Z      (1 << 30)   /* FP zero flag */
+#define FPSR_N      (1 << 31)   /* FP negative flag */
 
-static inline void vfp_set_fpsr(CPUARMState *env, uint32_t val)
-{
-    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPSR_MASK) | (val & FPSR_MASK);
-    vfp_set_fpscr(env, new_fpscr);
-}
+/* Cumulative exception status bits */
+#define FPSR_CEXC_MASK (FPSR_IOC | FPSR_DZC | FPSR_OFC | FPSR_UFC | FPSR_IXC | FPSR_IDC)
 
-static inline uint32_t vfp_get_fpcr(CPUARMState *env)
-{
-    return vfp_get_fpscr(env) & FPCR_MASK;
-}
+#define FPSR_NZCV_MASK (FPSR_N | FPSR_Z | FPSR_C | FPSR_V)
+#define FPSR_NZCVQC_MASK (FPSR_NZCV_MASK | FPSR_QC)
 
-static inline void vfp_set_fpcr(CPUARMState *env, uint32_t val)
-{
-    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPCR_MASK) | (val & FPCR_MASK);
-    vfp_set_fpscr(env, new_fpscr);
-}
+/* A32 FPSCR bits which architecturally map to FPSR bits */
+#define FPSCR_FPSR_MASK (FPSR_NZCVQC_MASK | FPSR_CEXC_MASK)
+/* A32 FPSCR bits which architecturally map to FPCR bits */
+#define FPSCR_FPCR_MASK (FPCR_EEXC_MASK | FPCR_LEN_MASK | FPCR_FZ16 | \
+                         FPCR_STRIDE_MASK | FPCR_RMODE_MASK | \
+                         FPCR_FZ | FPCR_DN | FPCR_AHP)
+/* These masks don't overlap: each bit lives in only one place */
+QEMU_BUILD_BUG_ON(FPSCR_FPSR_MASK & FPSCR_FPCR_MASK);
+
+/**
+ * vfp_get_fpsr: read the AArch64 FPSR
+ * @env: CPU context
+ *
+ * Return the current AArch64 FPSR value
+ */
+uint32_t vfp_get_fpsr(CPUARMState *env);
+
+/**
+ * vfp_get_fpcr: read the AArch64 FPCR
+ * @env: CPU context
+ *
+ * Return the current AArch64 FPCR value
+ */
+uint32_t vfp_get_fpcr(CPUARMState *env);
+
+/**
+ * vfp_set_fpsr: write the AArch64 FPSR
+ * @env: CPU context
+ * @value: new value
+ */
+void vfp_set_fpsr(CPUARMState *env, uint32_t value);
+
+/**
+ * vfp_set_fpcr: write the AArch64 FPCR
+ * @env: CPU context
+ * @value: new value
+ */
+void vfp_set_fpcr(CPUARMState *env, uint32_t value);
 
 enum arm_cpu_mode {
     ARM_CPU_MODE_USR = 0x10,
@@ -3309,8 +3354,8 @@ extern const uint64_t pred_esz_masks[5];
  */
 static inline target_ulong cpu_untagged_addr(CPUState *cs, target_ulong x)
 {
-    ARMCPU *cpu = ARM_CPU(cs);
-    if (cpu->env.tagged_addr_enable) {
+    CPUARMState *env = cpu_env(cs);
+    if (env->tagged_addr_enable) {
         /*
          * TBI is enabled for userspace but not kernelspace addresses.
          * Only clear the tag if bit 55 is clear.
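
Because the two masks partition the FPSCR bits, the legacy A32 register can
be reassembled from the two A64 views by masking and ORing. A minimal sketch
of the idea (the real accessors live in vfp_helper.c and may differ in
detail):

    uint32_t vfp_get_fpscr(CPUARMState *env)
    {
        /* Each FPSCR bit comes from exactly one of FPCR or FPSR */
        return (vfp_get_fpcr(env) & FPSCR_FPCR_MASK) |
               (vfp_get_fpsr(env) & FPSCR_FPSR_MASK);
    }
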
diff --git a/target/arm/internals.h b/target/arm/internals.h
index e1aa1a63b9..da22d04121 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -368,6 +368,9 @@ void arm_restore_state_to_opc(CPUState *cs,
 
 #ifdef CONFIG_TCG
 void arm_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb);
+
+/* Our implementation of TCGCPUOps::cpu_exec_halt */
+bool arm_cpu_exec_halt(CPUState *cs);
 #endif /* CONFIG_TCG */
 
 typedef enum ARMFPRounding {
diff --git a/target/arm/machine.c b/target/arm/machine.c
index 0a722ca7e7..a3c1e05e65 100644
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@@ -18,6 +18,35 @@ static bool vfp_needed(void *opaque)
             : cpu_isar_feature(aa32_vfp_simd, cpu));
 }
 
+static bool vfp_fpcr_fpsr_needed(void *opaque)
+{
+    /*
+     * If either the FPCR or the FPSR include set bits that are not
+     * visible in the AArch32 FPSCR view of floating point control/status
+     * then we must send the FPCR and FPSR as two separate fields in the
+     * cpu/vfp/fpcr_fpsr subsection, and we will send a 0 for the old
+     * FPSCR field in cpu/vfp.
+     *
+     * If all the set bits are representable in an AArch32 FPSCR then we
+     * send that value as the cpu/vfp FPSCR field, and don't send the
+     * cpu/vfp/fpcr_fpsr subsection.
+     *
+     * On incoming migration, if the cpu/vfp FPSCR field is non-zero we
+     * use it, and if the fpcr_fpsr subsection is present we use that.
+     * (The subsection will never be present with a non-zero FPSCR field,
+     * and if FPSCR is zero and the subsection is not present that means
+     * that FPSCR/FPSR/FPCR are zero.)
+     *
+     * This preserves migration compatibility with older QEMU versions,
+     * in both directions.
+     */
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+
+    return (vfp_get_fpcr(env) & ~FPSCR_FPCR_MASK) ||
+           (vfp_get_fpsr(env) & ~FPSCR_FPSR_MASK);
+}
+
 static int get_fpscr(QEMUFile *f, void *opaque, size_t size,
                      const VMStateField *field)
 {
@@ -25,7 +54,10 @@ static int get_fpscr(QEMUFile *f, void *opaque, size_t size,
     CPUARMState *env = &cpu->env;
     uint32_t val = qemu_get_be32(f);
 
-    vfp_set_fpscr(env, val);
+    if (val) {
+        /* 0 means we might have the data in the fpcr_fpsr subsection */
+        vfp_set_fpscr(env, val);
+    }
     return 0;
 }
 
@@ -34,8 +66,9 @@ static int put_fpscr(QEMUFile *f, void *opaque, size_t size,
 {
     ARMCPU *cpu = opaque;
     CPUARMState *env = &cpu->env;
+    uint32_t fpscr = vfp_fpcr_fpsr_needed(opaque) ? 0 : vfp_get_fpscr(env);
 
-    qemu_put_be32(f, vfp_get_fpscr(env));
+    qemu_put_be32(f, fpscr);
     return 0;
 }
 
@@ -45,6 +78,86 @@ static const VMStateInfo vmstate_fpscr = {
     .put = put_fpscr,
 };
 
+static int get_fpcr(QEMUFile *f, void *opaque, size_t size,
+                    const VMStateField *field)
+{
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+    uint64_t val = qemu_get_be64(f);
+
+    vfp_set_fpcr(env, val);
+    return 0;
+}
+
+static int put_fpcr(QEMUFile *f, void *opaque, size_t size,
+                    const VMStateField *field, JSONWriter *vmdesc)
+{
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+
+    qemu_put_be64(f, vfp_get_fpcr(env));
+    return 0;
+}
+
+static const VMStateInfo vmstate_fpcr = {
+    .name = "fpcr",
+    .get = get_fpcr,
+    .put = put_fpcr,
+};
+
+static int get_fpsr(QEMUFile *f, void *opaque, size_t size,
+                    const VMStateField *field)
+{
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+    uint64_t val = qemu_get_be64(f);
+
+    vfp_set_fpsr(env, val);
+    return 0;
+}
+
+static int put_fpsr(QEMUFile *f, void *opaque, size_t size,
+                    const VMStateField *field, JSONWriter *vmdesc)
+{
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+
+    qemu_put_be64(f, vfp_get_fpsr(env));
+    return 0;
+}
+
+static const VMStateInfo vmstate_fpsr = {
+    .name = "fpsr",
+    .get = get_fpsr,
+    .put = put_fpsr,
+};
+
+static const VMStateDescription vmstate_vfp_fpcr_fpsr = {
+    .name = "cpu/vfp/fpcr_fpsr",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = vfp_fpcr_fpsr_needed,
+    .fields = (const VMStateField[]) {
+        {
+            .name = "fpcr",
+            .version_id = 0,
+            .size = sizeof(uint64_t),
+            .info = &vmstate_fpcr,
+            .flags = VMS_SINGLE,
+            .offset = 0,
+        },
+        {
+            .name = "fpsr",
+            .version_id = 0,
+            .size = sizeof(uint64_t),
+            .info = &vmstate_fpsr,
+            .flags = VMS_SINGLE,
+            .offset = 0,
+        },
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_vfp = {
     .name = "cpu/vfp",
     .version_id = 3,
@@ -100,6 +213,10 @@ static const VMStateDescription vmstate_vfp = {
             .offset = 0,
         },
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * const []) {
+        &vmstate_vfp_fpcr_fpsr,
+        NULL
     }
 };
 
@@ -784,6 +901,20 @@ static int cpu_pre_load(void *opaque)
     ARMCPU *cpu = opaque;
     CPUARMState *env = &cpu->env;
 
+    /*
+     * In an inbound migration where on the source FPSCR/FPSR/FPCR are 0,
+     * there will be no fpcr_fpsr subsection so we won't call vfp_set_fpcr()
+     * and vfp_set_fpsr() from get_fpcr() and get_fpsr(); also the get_fpscr()
+     * function will not call vfp_set_fpscr() because it will see a 0 in the
+     * inbound data. Ensure that in this case we have a correctly set up
+     * zero FPSCR/FPCR/FPSR.
+     *
+     * This is not strictly needed because FPSCR is zero out of reset, but
+     * it avoids the possibility of future confusing migration bugs if some
+     * future architecture change makes the reset value non-zero.
+     */
+    vfp_set_fpscr(env, 0);
+
     /*
      * Pre-initialize irq_line_state to a value that's never valid as
      * real data, so cpu_post_load() can tell whether we've seen the
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 223eac3cac..2922de700c 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -785,6 +785,14 @@ SQRDMULH_s 0111 1110 ..1 ..... 10110 1 ..... ..... @rrr_e
 SQRDMLAH_s 0111 1110 ..0 ..... 10000 1 ..... ..... @rrr_e
 SQRDMLSH_s 0111 1110 ..0 ..... 10001 1 ..... ..... @rrr_e
 
+# Decode scalar x scalar as scalar x indexed, with index 0.
+SQDMULL_si 0101 1110 011 rm:5 11010 0 rn:5 rd:5 &rrx_e idx=0 esz=1
+SQDMULL_si 0101 1110 101 rm:5 11010 0 rn:5 rd:5 &rrx_e idx=0 esz=2
+SQDMLAL_si 0101 1110 011 rm:5 10010 0 rn:5 rd:5 &rrx_e idx=0 esz=1
+SQDMLAL_si 0101 1110 101 rm:5 10010 0 rn:5 rd:5 &rrx_e idx=0 esz=2
+SQDMLSL_si 0101 1110 011 rm:5 10110 0 rn:5 rd:5 &rrx_e idx=0 esz=1
+SQDMLSL_si 0101 1110 101 rm:5 10110 0 rn:5 rd:5 &rrx_e idx=0 esz=2
+
 ### Advanced SIMD scalar pairwise
 
 FADDP_s 0101 1110 0011 0000 1101 10 ..... ..... @rr_h
@@ -962,6 +970,42 @@ FCADD_270 0.10 1110 ..0 ..... 11110 1 ..... ..... @qrrr_e
 
 FCMLA_v 0 q:1 10 1110 esz:2 0 rm:5 110 rot:2 1 rn:5 rd:5
 
+SMULL_v 0.00 1110 ..1 ..... 11000 0 ..... ..... @qrrr_e
+UMULL_v 0.10 1110 ..1 ..... 11000 0 ..... ..... @qrrr_e
+SMLAL_v 0.00 1110 ..1 ..... 10000 0 ..... ..... @qrrr_e
+UMLAL_v 0.10 1110 ..1 ..... 10000 0 ..... ..... @qrrr_e
+SMLSL_v 0.00 1110 ..1 ..... 10100 0 ..... ..... @qrrr_e
+UMLSL_v 0.10 1110 ..1 ..... 10100 0 ..... ..... @qrrr_e
+
+SADDL_v 0.00 1110 ..1 ..... 00000 0 ..... ..... @qrrr_e
+UADDL_v 0.10 1110 ..1 ..... 00000 0 ..... ..... @qrrr_e
+SSUBL_v 0.00 1110 ..1 ..... 00100 0 ..... ..... @qrrr_e
+USUBL_v 0.10 1110 ..1 ..... 00100 0 ..... ..... @qrrr_e
+SABAL_v 0.00 1110 ..1 ..... 01010 0 ..... ..... @qrrr_e
+UABAL_v 0.10 1110 ..1 ..... 01010 0 ..... ..... @qrrr_e
+SABDL_v 0.00 1110 ..1 ..... 01110 0 ..... ..... @qrrr_e
+UABDL_v 0.10 1110 ..1 ..... 01110 0 ..... ..... @qrrr_e
+
+SQDMULL_v 0.00 1110 011 ..... 11010 0 ..... ..... @qrrr_h
+SQDMULL_v 0.00 1110 101 ..... 11010 0 ..... ..... @qrrr_s
+SQDMLAL_v 0.00 1110 011 ..... 10010 0 ..... ..... @qrrr_h
+SQDMLAL_v 0.00 1110 101 ..... 10010 0 ..... ..... @qrrr_s
+SQDMLSL_v 0.00 1110 011 ..... 10110 0 ..... ..... @qrrr_h
+SQDMLSL_v 0.00 1110 101 ..... 10110 0 ..... ..... @qrrr_s
+
+SADDW 0.00 1110 ..1 ..... 00010 0 ..... ..... @qrrr_e
+UADDW 0.10 1110 ..1 ..... 00010 0 ..... ..... @qrrr_e
+SSUBW 0.00 1110 ..1 ..... 00110 0 ..... ..... @qrrr_e
+USUBW 0.10 1110 ..1 ..... 00110 0 ..... ..... @qrrr_e
+
+ADDHN 0.00 1110 ..1 ..... 01000 0 ..... ..... @qrrr_e
+RADDHN 0.10 1110 ..1 ..... 01000 0 ..... ..... @qrrr_e
+SUBHN 0.00 1110 ..1 ..... 01100 0 ..... ..... @qrrr_e
+RSUBHN 0.10 1110 ..1 ..... 01100 0 ..... ..... @qrrr_e
+
+PMULL_p8 0.00 1110 001 ..... 11100 0 ..... ..... @qrrr_b
+PMULL_p64 0.00 1110 111 ..... 11100 0 ..... ..... @qrrr_b
+
 ### Advanced SIMD scalar x indexed element
 
 FMUL_si 0101 1111 00 .. .... 1001 . 0 ..... ..... @rrx_h
@@ -992,6 +1036,15 @@ SQRDMLAH_si 0111 1111 10 .. .... 1101 . 0 ..... ..... @rrx_s
 SQRDMLSH_si 0111 1111 01 .. .... 1111 . 0 ..... ..... @rrx_h
 SQRDMLSH_si 0111 1111 10 .. .... 1111 . 0 ..... ..... @rrx_s
 
+SQDMULL_si 0101 1111 01 .. .... 1011 . 0 ..... ..... @rrx_h
+SQDMULL_si 0101 1111 10 . ..... 1011 . 0 ..... ..... @rrx_s
+
+SQDMLAL_si 0101 1111 01 .. .... 0011 . 0 ..... ..... @rrx_h
+SQDMLAL_si 0101 1111 10 . ..... 0011 . 0 ..... ..... @rrx_s
+
+SQDMLSL_si 0101 1111 01 .. .... 0111 . 0 ..... ..... @rrx_h
+SQDMLSL_si 0101 1111 10 . ..... 0111 . 0 ..... ..... @rrx_s
+
 ### Advanced SIMD vector x indexed element
 
 FMUL_vi 0.00 1111 00 .. .... 1001 . 0 ..... ..... @qrrx_h
@@ -1047,6 +1100,30 @@ FCMLA_vi 0 0 10 1111 01 idx:1 rm:5 0 rot:2 1 0 0 rn:5 rd:5 esz=1 q=0
 FCMLA_vi 0 1 10 1111 01 . rm:5 0 rot:2 1 . 0 rn:5 rd:5 esz=1 idx=%hl q=1
 FCMLA_vi 0 1 10 1111 10 0 rm:5 0 rot:2 1 idx:1 0 rn:5 rd:5 esz=2 q=1
 
+SMULL_vi 0.00 1111 01 .. .... 1010 . 0 ..... ..... @qrrx_h
+SMULL_vi 0.00 1111 10 . ..... 1010 . 0 ..... ..... @qrrx_s
+UMULL_vi 0.10 1111 01 .. .... 1010 . 0 ..... ..... @qrrx_h
+UMULL_vi 0.10 1111 10 . ..... 1010 . 0 ..... ..... @qrrx_s
+
+SMLAL_vi 0.00 1111 01 .. .... 0010 . 0 ..... ..... @qrrx_h
+SMLAL_vi 0.00 1111 10 . ..... 0010 . 0 ..... ..... @qrrx_s
+UMLAL_vi 0.10 1111 01 .. .... 0010 . 0 ..... ..... @qrrx_h
+UMLAL_vi 0.10 1111 10 . ..... 0010 . 0 ..... ..... @qrrx_s
+
+SMLSL_vi 0.00 1111 01 .. .... 0110 . 0 ..... ..... @qrrx_h
+SMLSL_vi 0.00 1111 10 . ..... 0110 . 0 ..... ..... @qrrx_s
+UMLSL_vi 0.10 1111 01 .. .... 0110 . 0 ..... ..... @qrrx_h
+UMLSL_vi 0.10 1111 10 . ..... 0110 . 0 ..... ..... @qrrx_s
+
+SQDMULL_vi 0.00 1111 01 .. .... 1011 . 0 ..... ..... @qrrx_h
+SQDMULL_vi 0.00 1111 10 . ..... 1011 . 0 ..... ..... @qrrx_s
+
+SQDMLAL_vi 0.00 1111 01 .. .... 0011 . 0 ..... ..... @qrrx_h
+SQDMLAL_vi 0.00 1111 10 . ..... 0011 . 0 ..... ..... @qrrx_s
+
+SQDMLSL_vi 0.00 1111 01 .. .... 0111 . 0 ..... ..... @qrrx_h
+SQDMLSL_vi 0.00 1111 10 . ..... 0111 . 0 ..... ..... @qrrx_s
+
 # Floating-point conditional select
 
 FCSEL 0001 1110 .. 1 rm:5 cond:4 11 rn:5 rd:5 esz=%esz_hsd
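
Reading one of the new patterns left to right: in SQDMULL_si 0101 1110 011
rm:5 11010 0 rn:5 rd:5 &rrx_e idx=0 esz=1, the fixed bits select the scalar
SQDMULL encoding, rm/rn/rd name 5-bit register fields, and the &rrx_e
arguments force index 0 and a 16-bit element size, which is what lets the
scalar-by-scalar form share the scalar-by-indexed trans function. As a
worked example (my encoding, not from the patch), the word 0x5e62d020
(rm=2, rn=1, rd=0) should decode as SQDMULL s0, h1, h2.
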
diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c
index c059c681e9..5496f14dc1 100644
--- a/target/arm/tcg/cpu-v7m.c
+++ b/target/arm/tcg/cpu-v7m.c
@@ -244,6 +244,7 @@ static const TCGCPUOps arm_v7m_tcg_ops = {
 #else
     .tlb_fill = arm_cpu_tlb_fill,
     .cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt,
+    .cpu_exec_halt = arm_cpu_exec_halt,
     .do_interrupt = arm_v7m_cpu_do_interrupt,
     .do_transaction_failed = arm_cpu_do_transaction_failed,
     .do_unaligned_access = arm_cpu_do_unaligned_access,
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 8b99736aad..03ebef5ef2 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -1115,21 +1115,21 @@ static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
 
     if (update_flags) {
         /* Store C, clear NZV. */
-        env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
-        env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
+        env->vfp.fpsr &= ~FPSR_NZCV_MASK;
+        env->vfp.fpsr |= carry_in * FPSR_C;
     }
     mve_advance_vpt(env);
 }
 
 void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
 {
-    bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+    bool carry_in = env->vfp.fpsr & FPSR_C;
     do_vadc(env, vd, vn, vm, 0, carry_in, false);
 }
 
 void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
 {
-    bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+    bool carry_in = env->vfp.fpsr & FPSR_C;
     do_vadc(env, vd, vn, vm, -1, carry_in, false);
 }
 
@@ -3343,7 +3343,7 @@ static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top)
     uint32_t *m = vm;
     uint16_t r;
     uint16_t mask = mve_element_mask(env);
-    bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
+    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
     unsigned e;
     float_status *fpst;
     float_status scratch_fpst;
@@ -3373,7 +3373,7 @@ static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top)
     uint16_t *m = vm;
     uint32_t r;
     uint16_t mask = mve_element_mask(env);
-    bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
+    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
     unsigned e;
     float_status *fpst;
     float_status scratch_fpst;
diff --git a/target/arm/tcg/translate-a32.h b/target/arm/tcg/translate-a32.h
index 19de6e0a1a..0b1fa57965 100644
--- a/target/arm/tcg/translate-a32.h
+++ b/target/arm/tcg/translate-a32.h
@@ -83,6 +83,13 @@ void store_cpu_offset(TCGv_i32 var, int offset, int size);
                      sizeof_field(CPUARMState, name));              \
     })
 
+/* Store to the low half of a 64-bit field from a TCGv_i32 */
+#define store_cpu_field_low32(val, name)                                \
+    ({                                                                  \
+        QEMU_BUILD_BUG_ON(sizeof_field(CPUARMState, name) != 8);        \
+        store_cpu_offset(val, offsetoflow32(CPUARMState, name), 4);     \
+    })
+
 #define store_cpu_field_constant(val, name) \
     store_cpu_field(tcg_constant_i32(val), name)
 
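
A usage sketch, mirroring what translate-m-nocp.c does later in this series:
to update only the A32-visible NZCV flags in the 64-bit vfp.fpsr field, read
the low word, mask, merge and store it back, leaving the upper half
untouched. Here nzcv is an assumed TCGv_i32 holding the new flag bits:

    TCGv_i32 f = load_cpu_field_low32(vfp.fpsr);
    tcg_gen_andi_i32(f, f, ~FPSR_NZCV_MASK);
    tcg_gen_or_i32(f, f, nzcv);              /* nzcv: caller-provided */
    store_cpu_field_low32(f, vfp.fpsr);
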
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 6c07aeaf3b..559a6cd799 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -5664,6 +5664,357 @@ static bool trans_FCMLA_v(DisasContext *s, arg_FCMLA_v *a)
     return true;
 }
 
+/*
+ * Widening vector x vector/indexed.
+ *
+ * These read from the top or bottom half of a 128-bit vector.
+ * After widening, optionally accumulate with a 128-bit vector.
+ * Implement these inline, as the number of elements is limited
+ * and the related SVE and SME operations on larger vectors use
+ * even/odd elements instead of top/bottom half.
+ *
+ * If idx >= 0, operand 2 is indexed, otherwise vector.
+ * If acc, operand 0 is loaded with rd.
+ */
+
+/* For low half, iterating up. */
+static bool do_3op_widening(DisasContext *s, MemOp memop, int top,
+                            int rd, int rn, int rm, int idx,
+                            NeonGenTwo64OpFn *fn, bool acc)
+{
+    TCGv_i64 tcg_op0 = tcg_temp_new_i64();
+    TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+    TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+    MemOp esz = memop & MO_SIZE;
+    int half = 8 >> esz;
+    int top_swap, top_half;
+
+    /* There are no 64x64->128 bit operations. */
+    if (esz >= MO_64) {
+        return false;
+    }
+    if (!fp_access_check(s)) {
+        return true;
+    }
+
+    if (idx >= 0) {
+        read_vec_element(s, tcg_op2, rm, idx, memop);
+    }
+
+    /*
+     * For top half inputs, iterate forward; backward for bottom half.
+     * This means the store to the destination will not occur until
+     * overlapping inputs are consumed.
+     * Use top_swap to conditionally invert the forward iteration index.
+     */
+    top_swap = top ? 0 : half - 1;
+    top_half = top ? half : 0;
+
+    for (int elt_fwd = 0; elt_fwd < half; ++elt_fwd) {
+        int elt = elt_fwd ^ top_swap;
+
+        read_vec_element(s, tcg_op1, rn, elt + top_half, memop);
+        if (idx < 0) {
+            read_vec_element(s, tcg_op2, rm, elt + top_half, memop);
+        }
+        if (acc) {
+            read_vec_element(s, tcg_op0, rd, elt, memop + 1);
+        }
+        fn(tcg_op0, tcg_op1, tcg_op2);
+        write_vec_element(s, tcg_op0, rd, elt, esz + 1);
+    }
+    clear_vec_high(s, 1, rd);
+    return true;
+}
+
+static void gen_muladd_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+    tcg_gen_mul_i64(t, n, m);
+    tcg_gen_add_i64(d, d, t);
+}
+
+static void gen_mulsub_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+    tcg_gen_mul_i64(t, n, m);
+    tcg_gen_sub_i64(d, d, t);
+}
+
+TRANS(SMULL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_mul_i64, false)
+TRANS(UMULL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_mul_i64, false)
+TRANS(SMLAL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      gen_muladd_i64, true)
+TRANS(UMLAL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      gen_muladd_i64, true)
+TRANS(SMLSL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      gen_mulsub_i64, true)
+TRANS(UMLSL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      gen_mulsub_i64, true)
+
+TRANS(SMULL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      tcg_gen_mul_i64, false)
+TRANS(UMULL_vi, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, a->idx,
+      tcg_gen_mul_i64, false)
+TRANS(SMLAL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      gen_muladd_i64, true)
+TRANS(UMLAL_vi, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, a->idx,
+      gen_muladd_i64, true)
+TRANS(SMLSL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      gen_mulsub_i64, true)
+TRANS(UMLSL_vi, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, a->idx,
+      gen_mulsub_i64, true)
+
+static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    tcg_gen_sub_i64(t1, n, m);
+    tcg_gen_sub_i64(t2, m, n);
+    tcg_gen_movcond_i64(TCG_COND_GE, d, n, m, t1, t2);
+}
+
+static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    tcg_gen_sub_i64(t1, n, m);
+    tcg_gen_sub_i64(t2, m, n);
+    tcg_gen_movcond_i64(TCG_COND_GEU, d, n, m, t1, t2);
+}
+
+static void gen_saba_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+    gen_sabd_i64(t, n, m);
+    tcg_gen_add_i64(d, d, t);
+}
+
+static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+    gen_uabd_i64(t, n, m);
+    tcg_gen_add_i64(d, d, t);
+}
+
+TRANS(SADDL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_add_i64, false)
+TRANS(UADDL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_add_i64, false)
+TRANS(SSUBL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_sub_i64, false)
+TRANS(USUBL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      tcg_gen_sub_i64, false)
+TRANS(SABDL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      gen_sabd_i64, false)
+TRANS(UABDL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      gen_uabd_i64, false)
+TRANS(SABAL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      gen_saba_i64, true)
+TRANS(UABAL_v, do_3op_widening,
+      a->esz, a->q, a->rd, a->rn, a->rm, -1,
+      gen_uaba_i64, true)
+
+static void gen_sqdmull_h(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    tcg_gen_mul_i64(d, n, m);
+    gen_helper_neon_addl_saturate_s32(d, tcg_env, d, d);
+}
+
+static void gen_sqdmull_s(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    tcg_gen_mul_i64(d, n, m);
+    gen_helper_neon_addl_saturate_s64(d, tcg_env, d, d);
+}
+
+static void gen_sqdmlal_h(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_mul_i64(t, n, m);
+    gen_helper_neon_addl_saturate_s32(t, tcg_env, t, t);
+    gen_helper_neon_addl_saturate_s32(d, tcg_env, d, t);
+}
+
+static void gen_sqdmlal_s(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_mul_i64(t, n, m);
+    gen_helper_neon_addl_saturate_s64(t, tcg_env, t, t);
+    gen_helper_neon_addl_saturate_s64(d, tcg_env, d, t);
+}
+
+static void gen_sqdmlsl_h(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_mul_i64(t, n, m);
+    gen_helper_neon_addl_saturate_s32(t, tcg_env, t, t);
+    tcg_gen_neg_i64(t, t);
+    gen_helper_neon_addl_saturate_s32(d, tcg_env, d, t);
+}
+
+static void gen_sqdmlsl_s(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_mul_i64(t, n, m);
+    gen_helper_neon_addl_saturate_s64(t, tcg_env, t, t);
+    tcg_gen_neg_i64(t, t);
+    gen_helper_neon_addl_saturate_s64(d, tcg_env, d, t);
+}
+
+TRANS(SQDMULL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      a->esz == MO_16 ? gen_sqdmull_h : gen_sqdmull_s, false)
+TRANS(SQDMLAL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      a->esz == MO_16 ? gen_sqdmlal_h : gen_sqdmlal_s, true)
+TRANS(SQDMLSL_v, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, -1,
+      a->esz == MO_16 ? gen_sqdmlsl_h : gen_sqdmlsl_s, true)
+
+TRANS(SQDMULL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      a->esz == MO_16 ? gen_sqdmull_h : gen_sqdmull_s, false)
+TRANS(SQDMLAL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      a->esz == MO_16 ? gen_sqdmlal_h : gen_sqdmlal_s, true)
+TRANS(SQDMLSL_vi, do_3op_widening,
+      a->esz | MO_SIGN, a->q, a->rd, a->rn, a->rm, a->idx,
+      a->esz == MO_16 ? gen_sqdmlsl_h : gen_sqdmlsl_s, true)
+
+static bool do_addsub_wide(DisasContext *s, arg_qrrr_e *a,
+                           MemOp sign, bool sub)
+{
+    TCGv_i64 tcg_op0, tcg_op1;
+    MemOp esz = a->esz;
+    int half = 8 >> esz;
+    bool top = a->q;
+    int top_swap = top ? 0 : half - 1;
+    int top_half = top ? half : 0;
+
+    /* There are no 64x64->128 bit operations. */
+    if (esz >= MO_64) {
+        return false;
+    }
+    if (!fp_access_check(s)) {
+        return true;
+    }
+    tcg_op0 = tcg_temp_new_i64();
+    tcg_op1 = tcg_temp_new_i64();
+
+    for (int elt_fwd = 0; elt_fwd < half; ++elt_fwd) {
+        int elt = elt_fwd ^ top_swap;
+
+        read_vec_element(s, tcg_op1, a->rm, elt + top_half, esz | sign);
+        read_vec_element(s, tcg_op0, a->rn, elt, esz + 1);
+        if (sub) {
+            tcg_gen_sub_i64(tcg_op0, tcg_op0, tcg_op1);
+        } else {
+            tcg_gen_add_i64(tcg_op0, tcg_op0, tcg_op1);
+        }
+        write_vec_element(s, tcg_op0, a->rd, elt, esz + 1);
+    }
+    clear_vec_high(s, 1, a->rd);
+    return true;
+}
+
+TRANS(SADDW, do_addsub_wide, a, MO_SIGN, false)
+TRANS(UADDW, do_addsub_wide, a, 0, false)
+TRANS(SSUBW, do_addsub_wide, a, MO_SIGN, true)
+TRANS(USUBW, do_addsub_wide, a, 0, true)
+
+static bool do_addsub_highnarrow(DisasContext *s, arg_qrrr_e *a,
+                                 bool sub, bool round)
+{
+    TCGv_i64 tcg_op0, tcg_op1;
+    MemOp esz = a->esz;
+    int half = 8 >> esz;
+    bool top = a->q;
+    int ebits = 8 << esz;
+    uint64_t rbit = 1ull << (ebits - 1);
+    int top_swap, top_half;
+
+    /* There are no 128x128->64 bit operations. */
+    if (esz >= MO_64) {
+        return false;
+    }
+    if (!fp_access_check(s)) {
+        return true;
+    }
+    tcg_op0 = tcg_temp_new_i64();
+    tcg_op1 = tcg_temp_new_i64();
+
+    /*
+     * For top half inputs, iterate backward; forward for bottom half.
+     * This means the store to the destination will not occur until
+     * overlapping inputs are consumed.
+     */
+    top_swap = top ? half - 1 : 0;
+    top_half = top ? half : 0;
+
+    for (int elt_fwd = 0; elt_fwd < half; ++elt_fwd) {
+        int elt = elt_fwd ^ top_swap;
+
+        read_vec_element(s, tcg_op1, a->rm, elt, esz + 1);
+        read_vec_element(s, tcg_op0, a->rn, elt, esz + 1);
+        if (sub) {
+            tcg_gen_sub_i64(tcg_op0, tcg_op0, tcg_op1);
+        } else {
+            tcg_gen_add_i64(tcg_op0, tcg_op0, tcg_op1);
+        }
+        if (round) {
+            tcg_gen_addi_i64(tcg_op0, tcg_op0, rbit);
+        }
+        tcg_gen_shri_i64(tcg_op0, tcg_op0, ebits);
+        write_vec_element(s, tcg_op0, a->rd, elt + top_half, esz);
+    }
+    clear_vec_high(s, top, a->rd);
+    return true;
+}
+
+TRANS(ADDHN, do_addsub_highnarrow, a, false, false)
+TRANS(SUBHN, do_addsub_highnarrow, a, true, false)
+TRANS(RADDHN, do_addsub_highnarrow, a, false, true)
+TRANS(RSUBHN, do_addsub_highnarrow, a, true, true)
+
+static bool do_pmull(DisasContext *s, arg_qrrr_e *a, gen_helper_gvec_3 *fn)
+{
+    if (fp_access_check(s)) {
+        /* The Q field specifies lo/hi half input for these insns. */
+        gen_gvec_op3_ool(s, true, a->rd, a->rn, a->rm, a->q, fn);
+    }
+    return true;
+}
+
+TRANS(PMULL_p8, do_pmull, a, gen_helper_neon_pmull_h)
+TRANS_FEAT(PMULL_p64, aa64_pmull, do_pmull, a, gen_helper_gvec_pmull_q)
+
 /*
  * Advanced SIMD scalar/vector x indexed element
  */
@@ -5815,6 +6166,38 @@ static bool do_env_scalar3_idx_hs(DisasContext *s, arg_rrx_e *a,
 TRANS_FEAT(SQRDMLAH_si, aa64_rdm, do_env_scalar3_idx_hs, a, &f_scalar_sqrdmlah)
 TRANS_FEAT(SQRDMLSH_si, aa64_rdm, do_env_scalar3_idx_hs, a, &f_scalar_sqrdmlsh)
 
+static bool do_scalar_muladd_widening_idx(DisasContext *s, arg_rrx_e *a,
+                                          NeonGenTwo64OpFn *fn, bool acc)
+{
+    if (fp_access_check(s)) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+        unsigned vsz, dofs;
+
+        if (acc) {
+            read_vec_element(s, t0, a->rd, 0, a->esz + 1);
+        }
+        read_vec_element(s, t1, a->rn, 0, a->esz | MO_SIGN);
+        read_vec_element(s, t2, a->rm, a->idx, a->esz | MO_SIGN);
+        fn(t0, t1, t2);
+
+        /* Clear the whole register first, then store scalar. */
+        vsz = vec_full_reg_size(s);
+        dofs = vec_full_reg_offset(s, a->rd);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, vsz, vsz, 0);
+        write_vec_element(s, t0, a->rd, 0, a->esz + 1);
+    }
+    return true;
+}
+
+TRANS(SQDMULL_si, do_scalar_muladd_widening_idx, a,
+      a->esz == MO_16 ? gen_sqdmull_h : gen_sqdmull_s, false)
+TRANS(SQDMLAL_si, do_scalar_muladd_widening_idx, a,
+      a->esz == MO_16 ? gen_sqdmlal_h : gen_sqdmlal_s, true)
+TRANS(SQDMLSL_si, do_scalar_muladd_widening_idx, a,
+      a->esz == MO_16 ? gen_sqdmlsl_h : gen_sqdmlsl_s, true)
+
 static bool do_fp3_vector_idx(DisasContext *s, arg_qrrx_e *a,
                               gen_helper_gvec_3_ptr * const fns[3])
 {
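
The elt_fwd ^ top_swap trick in do_3op_widening gives the iteration order
the comment asks for without needing a second loop: XOR with half - 1
reverses a power-of-two index sequence, while XOR with 0 leaves it alone.
For example, with 16-bit elements (half = 4):

    /* top = 1: top_swap = 0 -> elt visits 0, 1, 2, 3 (forward)  */
    /* top = 0: top_swap = 3 -> elt visits 3, 2, 1, 0 (backward) */
    int elt = elt_fwd ^ top_swap;

Writing the double-width destination element only after the overlapping
narrow inputs have been read is what makes in-place operands (rd == rn or
rd == rm) safe.
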
@@ -9647,102 +10030,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
     }
 }
 
-/* AdvSIMD scalar three different
- *  31 30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
- * +-----+---+-----------+------+---+------+--------+-----+------+------+
- * | 0 1 | U | 1 1 1 1 0 | size | 1 |  Rm  | opcode | 0 0 |  Rn  |  Rd  |
- * +-----+---+-----------+------+---+------+--------+-----+------+------+
- */
-static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
-{
-    bool is_u = extract32(insn, 29, 1);
-    int size = extract32(insn, 22, 2);
-    int opcode = extract32(insn, 12, 4);
-    int rm = extract32(insn, 16, 5);
-    int rn = extract32(insn, 5, 5);
-    int rd = extract32(insn, 0, 5);
-
-    if (is_u) {
-        unallocated_encoding(s);
-        return;
-    }
-
-    switch (opcode) {
-    case 0x9: /* SQDMLAL, SQDMLAL2 */
-    case 0xb: /* SQDMLSL, SQDMLSL2 */
-    case 0xd: /* SQDMULL, SQDMULL2 */
-        if (size == 0 || size == 3) {
-            unallocated_encoding(s);
-            return;
-        }
-        break;
-    default:
-        unallocated_encoding(s);
-        return;
-    }
-
-    if (!fp_access_check(s)) {
-        return;
-    }
-
-    if (size == 2) {
-        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
-        TCGv_i64 tcg_op2 = tcg_temp_new_i64();
-        TCGv_i64 tcg_res = tcg_temp_new_i64();
-
-        read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN);
-        read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN);
-
-        tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2);
-        gen_helper_neon_addl_saturate_s64(tcg_res, tcg_env, tcg_res, tcg_res);
-
-        switch (opcode) {
-        case 0xd: /* SQDMULL, SQDMULL2 */
-            break;
-        case 0xb: /* SQDMLSL, SQDMLSL2 */
-            tcg_gen_neg_i64(tcg_res, tcg_res);
-            /* fall through */
-        case 0x9: /* SQDMLAL, SQDMLAL2 */
-            read_vec_element(s, tcg_op1, rd, 0, MO_64);
-            gen_helper_neon_addl_saturate_s64(tcg_res, tcg_env,
-                                              tcg_res, tcg_op1);
-            break;
-        default:
-            g_assert_not_reached();
-        }
-
-        write_fp_dreg(s, rd, tcg_res);
-    } else {
-        TCGv_i32 tcg_op1 = read_fp_hreg(s, rn);
-        TCGv_i32 tcg_op2 = read_fp_hreg(s, rm);
-        TCGv_i64 tcg_res = tcg_temp_new_i64();
-
-        gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2);
-        gen_helper_neon_addl_saturate_s32(tcg_res, tcg_env, tcg_res, tcg_res);
-
-        switch (opcode) {
-        case 0xd: /* SQDMULL, SQDMULL2 */
-            break;
-        case 0xb: /* SQDMLSL, SQDMLSL2 */
-            gen_helper_neon_negl_u32(tcg_res, tcg_res);
-            /* fall through */
-        case 0x9: /* SQDMLAL, SQDMLAL2 */
-        {
-            TCGv_i64 tcg_op3 = tcg_temp_new_i64();
-            read_vec_element(s, tcg_op3, rd, 0, MO_32);
-            gen_helper_neon_addl_saturate_s32(tcg_res, tcg_env,
-                                              tcg_res, tcg_op3);
-            break;
-        }
-        default:
-            g_assert_not_reached();
-        }
-
-        tcg_gen_ext32u_i64(tcg_res, tcg_res);
-        write_fp_dreg(s, rd, tcg_res);
-    }
-}
-
 static void handle_2misc_64(DisasContext *s, int opcode, bool u,
                             TCGv_i64 tcg_rd, TCGv_i64 tcg_rn,
                             TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus)
@@ -10592,416 +10879,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
     }
 }
 
-/* Generate code to do a "long" addition or subtraction, ie one done in
- * TCGv_i64 on vector lanes twice the width specified by size.
- */
-static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res,
-                          TCGv_i64 tcg_op1, TCGv_i64 tcg_op2)
-{
-    static NeonGenTwo64OpFn * const fns[3][2] = {
-        { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 },
-        { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 },
-        { tcg_gen_add_i64, tcg_gen_sub_i64 },
-    };
-    NeonGenTwo64OpFn *genfn;
-    assert(size < 3);
-
-    genfn = fns[size][is_sub];
-    genfn(tcg_res, tcg_op1, tcg_op2);
-}
-
-static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
-                                int opcode, int rd, int rn, int rm)
-{
-    /* 3-reg-different widening insns: 64 x 64 -> 128 */
-    TCGv_i64 tcg_res[2];
-    int pass, accop;
-
-    tcg_res[0] = tcg_temp_new_i64();
-    tcg_res[1] = tcg_temp_new_i64();
-
-    /* Does this op do an adding accumulate, a subtracting accumulate,
-     * or no accumulate at all?
-     */
-    switch (opcode) {
-    case 5:
-    case 8:
-    case 9:
-        accop = 1;
-        break;
-    case 10:
-    case 11:
-        accop = -1;
-        break;
-    default:
-        accop = 0;
-        break;
-    }
-
-    if (accop != 0) {
-        read_vec_element(s, tcg_res[0], rd, 0, MO_64);
-        read_vec_element(s, tcg_res[1], rd, 1, MO_64);
-    }
-
-    /* size == 2 means two 32x32->64 operations; this is worth special
-     * casing because we can generally handle it inline.
-     */
-    if (size == 2) {
-        for (pass = 0; pass < 2; pass++) {
-            TCGv_i64 tcg_op1 = tcg_temp_new_i64();
-            TCGv_i64 tcg_op2 = tcg_temp_new_i64();
-            TCGv_i64 tcg_passres;
-            MemOp memop = MO_32 | (is_u ? 0 : MO_SIGN);
-
-            int elt = pass + is_q * 2;
-
-            read_vec_element(s, tcg_op1, rn, elt, memop);
-            read_vec_element(s, tcg_op2, rm, elt, memop);
-
-            if (accop == 0) {
-                tcg_passres = tcg_res[pass];
-            } else {
-                tcg_passres = tcg_temp_new_i64();
-            }
-
-            switch (opcode) {
-            case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
-                tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2);
-                break;
-            case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
-                tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2);
-                break;
-            case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
-            case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
-            {
-                TCGv_i64 tcg_tmp1 = tcg_temp_new_i64();
-                TCGv_i64 tcg_tmp2 = tcg_temp_new_i64();
-
-                tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2);
-                tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1);
-                tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
-                                    tcg_passres,
-                                    tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2);
-                break;
-            }
-            case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
-            case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
-            case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
-                tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
-                break;
-            case 9: /* SQDMLAL, SQDMLAL2 */
-            case 11: /* SQDMLSL, SQDMLSL2 */
-            case 13: /* SQDMULL, SQDMULL2 */
-                tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
-                gen_helper_neon_addl_saturate_s64(tcg_passres, tcg_env,
-                                                  tcg_passres, tcg_passres);
-                break;
-            default:
-                g_assert_not_reached();
-            }
-
-            if (opcode == 9 || opcode == 11) {
-                /* saturating accumulate ops */
-                if (accop < 0) {
-                    tcg_gen_neg_i64(tcg_passres, tcg_passres);
-                }
-                gen_helper_neon_addl_saturate_s64(tcg_res[pass], tcg_env,
-                                                  tcg_res[pass], tcg_passres);
-            } else if (accop > 0) {
-                tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
-            } else if (accop < 0) {
-                tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
-            }
-        }
-    } else {
-        /* size 0 or 1, generally helper functions */
-        for (pass = 0; pass < 2; pass++) {
-            TCGv_i32 tcg_op1 = tcg_temp_new_i32();
-            TCGv_i32 tcg_op2 = tcg_temp_new_i32();
-            TCGv_i64 tcg_passres;
-            int elt = pass + is_q * 2;
-
-            read_vec_element_i32(s, tcg_op1, rn, elt, MO_32);
-            read_vec_element_i32(s, tcg_op2, rm, elt, MO_32);
-
-            if (accop == 0) {
-                tcg_passres = tcg_res[pass];
-            } else {
-                tcg_passres = tcg_temp_new_i64();
-            }
-
-            switch (opcode) {
-            case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
-            case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
-            {
-                TCGv_i64 tcg_op2_64 = tcg_temp_new_i64();
-                static NeonGenWidenFn * const widenfns[2][2] = {
-                    { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
-                    { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
-                };
-                NeonGenWidenFn *widenfn = widenfns[size][is_u];
-
-                widenfn(tcg_op2_64, tcg_op2);
-                widenfn(tcg_passres, tcg_op1);
-                gen_neon_addl(size, (opcode == 2), tcg_passres,
-                              tcg_passres, tcg_op2_64);
-                break;
-            }
-            case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
-            case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
-                if (size == 0) {
-                    if (is_u) {
-                        gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2);
-                    } else {
-                        gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2);
-                    }
-                } else {
-                    if (is_u) {
-                        gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2);
-                    } else {
-                        gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2);
-                    }
-                }
-                break;
-            case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
-            case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
-            case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
-                if (size == 0) {
-                    if (is_u) {
-                        gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2);
-                    } else {
-                        gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2);
-                    }
-                } else {
-                    if (is_u) {
-                        gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2);
-                    } else {
-                        gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
-                    }
-                }
-                break;
-            case 9: /* SQDMLAL, SQDMLAL2 */
-            case 11: /* SQDMLSL, SQDMLSL2 */
-            case 13: /* SQDMULL, SQDMULL2 */
-                assert(size == 1);
-                gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
-                gen_helper_neon_addl_saturate_s32(tcg_passres, tcg_env,
-                                                  tcg_passres, tcg_passres);
-                break;
-            default:
-                g_assert_not_reached();
-            }
-
-            if (accop != 0) {
-                if (opcode == 9 || opcode == 11) {
-                    /* saturating accumulate ops */
-                    if (accop < 0) {
-                        gen_helper_neon_negl_u32(tcg_passres, tcg_passres);
-                    }
-                    gen_helper_neon_addl_saturate_s32(tcg_res[pass], tcg_env,
-                                                      tcg_res[pass],
-                                                      tcg_passres);
-                } else {
-                    gen_neon_addl(size, (accop < 0), tcg_res[pass],
-                                  tcg_res[pass], tcg_passres);
-                }
-            }
-        }
-    }
-
-    write_vec_element(s, tcg_res[0], rd, 0, MO_64);
-    write_vec_element(s, tcg_res[1], rd, 1, MO_64);
-}
-
-static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size,
-                            int opcode, int rd, int rn, int rm)
-{
-    TCGv_i64 tcg_res[2];
-    int part = is_q ? 2 : 0;
-    int pass;
-
-    for (pass = 0; pass < 2; pass++) {
-        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
-        TCGv_i32 tcg_op2 = tcg_temp_new_i32();
-        TCGv_i64 tcg_op2_wide = tcg_temp_new_i64();
-        static NeonGenWidenFn * const widenfns[3][2] = {
-            { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
-            { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
-            { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 },
-        };
-        NeonGenWidenFn *widenfn = widenfns[size][is_u];
-
-        read_vec_element(s, tcg_op1, rn, pass, MO_64);
-        read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32);
-        widenfn(tcg_op2_wide, tcg_op2);
-        tcg_res[pass] = tcg_temp_new_i64();
-        gen_neon_addl(size, (opcode == 3),
-                      tcg_res[pass], tcg_op1, tcg_op2_wide);
-    }
-
-    for (pass = 0; pass < 2; pass++) {
-        write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
-    }
-}
-
-static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in)
-{
-    tcg_gen_addi_i64(in, in, 1U << 31);
-    tcg_gen_extrh_i64_i32(res, in);
-}
-
-static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,
-                                 int opcode, int rd, int rn, int rm)
-{
-    TCGv_i32 tcg_res[2];
-    int part = is_q ? 2 : 0;
-    int pass;
-
-    for (pass = 0; pass < 2; pass++) {
-        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
-        TCGv_i64 tcg_op2 = tcg_temp_new_i64();
-        TCGv_i64 tcg_wideres = tcg_temp_new_i64();
-        static NeonGenNarrowFn * const narrowfns[3][2] = {
-            { gen_helper_neon_narrow_high_u8,
-              gen_helper_neon_narrow_round_high_u8 },
-            { gen_helper_neon_narrow_high_u16,
-              gen_helper_neon_narrow_round_high_u16 },
-            { tcg_gen_extrh_i64_i32, do_narrow_round_high_u32 },
-        };
-        NeonGenNarrowFn *gennarrow = narrowfns[size][is_u];
-
-        read_vec_element(s, tcg_op1, rn, pass, MO_64);
-        read_vec_element(s, tcg_op2, rm, pass, MO_64);
-
-        gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2);
-
-        tcg_res[pass] = tcg_temp_new_i32();
-        gennarrow(tcg_res[pass], tcg_wideres);
-    }
-
-    for (pass = 0; pass < 2; pass++) {
-        write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32);
-    }
-    clear_vec_high(s, is_q, rd);
-}
-
-/* AdvSIMD three different
- *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
- * +---+---+---+-----------+------+---+------+--------+-----+------+------+
- * | 0 | Q | U | 0 1 1 1 0 | size | 1 |  Rm  | opcode | 0 0 |  Rn  |  Rd  |
- * +---+---+---+-----------+------+---+------+--------+-----+------+------+
- */
-static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
-{
-    /* Instructions in this group fall into three basic classes
-     * (in each case with the operation working on each element in
-     * the input vectors):
-     * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra
-     *     128 bit input)
-     * (2) wide 64 x 128 -> 128
-     * (3) narrowing 128 x 128 -> 64
-     * Here we do initial decode, catch unallocated cases and
-     * dispatch to separate functions for each class.
-     */
-    int is_q = extract32(insn, 30, 1);
-    int is_u = extract32(insn, 29, 1);
-    int size = extract32(insn, 22, 2);
-    int opcode = extract32(insn, 12, 4);
-    int rm = extract32(insn, 16, 5);
-    int rn = extract32(insn, 5, 5);
-    int rd = extract32(insn, 0, 5);
-
-    switch (opcode) {
-    case 1: /* SADDW, SADDW2, UADDW, UADDW2 */
-    case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */
-        /* 64 x 128 -> 128 */
-        if (size == 3) {
-            unallocated_encoding(s);
-            return;
-        }
-        if (!fp_access_check(s)) {
-            return;
-        }
-        handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm);
-        break;
-    case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */
-    case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */
-        /* 128 x 128 -> 64 */
-        if (size == 3) {
-            unallocated_encoding(s);
-            return;
-        }
-        if (!fp_access_check(s)) {
-            return;
-        }
-        handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
-        break;
-    case 14: /* PMULL, PMULL2 */
-        if (is_u) {
-            unallocated_encoding(s);
-            return;
-        }
-        switch (size) {
-        case 0: /* PMULL.P8 */
-            if (!fp_access_check(s)) {
-                return;
-            }
-            /* The Q field specifies lo/hi half input for this insn. */
-            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
-                             gen_helper_neon_pmull_h);
-            break;
-
-        case 3: /* PMULL.P64 */
-            if (!dc_isar_feature(aa64_pmull, s)) {
-                unallocated_encoding(s);
-                return;
-            }
-            if (!fp_access_check(s)) {
-                return;
-            }
-            /* The Q field specifies lo/hi half input for this insn. */
-            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
-                             gen_helper_gvec_pmull_q);
-            break;
-
-        default:
-            unallocated_encoding(s);
-            break;
-        }
-        return;
-    case 9: /* SQDMLAL, SQDMLAL2 */
-    case 11: /* SQDMLSL, SQDMLSL2 */
-    case 13: /* SQDMULL, SQDMULL2 */
-        if (is_u || size == 0) {
-            unallocated_encoding(s);
-            return;
-        }
-        /* fall through */
-    case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
-    case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
-    case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
-    case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
-    case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
-    case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
-    case 12: /* SMULL, SMULL2, UMULL, UMULL2 */
-        /* 64 x 64 -> 128 */
-        if (size == 3) {
-            unallocated_encoding(s);
-            return;
-        }
-        if (!fp_access_check(s)) {
-            return;
-        }
-
-        handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm);
-        break;
-    default:
-        /* opcode 15 not allocated */
-        unallocated_encoding(s);
-        break;
-    }
-}
-
 static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q,
                                   int size, int rn, int rd)
 {
1 : 2); pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_passres; - int passelt; - - if (is_scalar) { - passelt = 0; - } else { - passelt = pass + (is_q * 2); - } - - read_vec_element(s, tcg_op, rn, passelt, memop); - - tcg_res[pass] = tcg_temp_new_i64(); - - if (opcode == 0xa || opcode == 0xb) { - /* Non-accumulating ops */ - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx); - - if (satop) { - /* saturating, doubling */ - gen_helper_neon_addl_saturate_s64(tcg_passres, tcg_env, - tcg_passres, tcg_passres); - } - - if (opcode == 0xa || opcode == 0xb) { - continue; - } - - /* Accumulating op: handle accumulate step */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - - switch (opcode) { - case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - break; - case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - break; - case 0x7: /* SQDMLSL, SQDMLSL2 */ - tcg_gen_neg_i64(tcg_passres, tcg_passres); - /* fall through */ - case 0x3: /* SQDMLAL, SQDMLAL2 */ - gen_helper_neon_addl_saturate_s64(tcg_res[pass], tcg_env, - tcg_res[pass], - tcg_passres); - break; - default: - g_assert_not_reached(); - } - } - - clear_vec_high(s, !is_scalar, rd); - } else { - TCGv_i32 tcg_idx = tcg_temp_new_i32(); - - assert(size == 1); - read_vec_element_i32(s, tcg_idx, rm, index, size); - - if (!is_scalar) { - /* The simplest way to handle the 16x16 indexed ops is to - * duplicate the index into both halves of the 32 bit tcg_idx - * and then use the usual Neon helpers. - */ - tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); - } - - for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i64 tcg_passres; - - if (is_scalar) { - read_vec_element_i32(s, tcg_op, rn, pass, size); - } else { - read_vec_element_i32(s, tcg_op, rn, - pass + (is_q * 2), MO_32); - } - - tcg_res[pass] = tcg_temp_new_i64(); - - if (opcode == 0xa || opcode == 0xb) { - /* Non-accumulating ops */ - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - if (memop & MO_SIGN) { - gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx); - } else { - gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx); - } - if (satop) { - gen_helper_neon_addl_saturate_s32(tcg_passres, tcg_env, - tcg_passres, tcg_passres); - } - - if (opcode == 0xa || opcode == 0xb) { - continue; - } - - /* Accumulating op: handle accumulate step */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - - switch (opcode) { - case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass], - tcg_passres); - break; - case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass], - tcg_passres); - break; - case 0x7: /* SQDMLSL, SQDMLSL2 */ - gen_helper_neon_negl_u32(tcg_passres, tcg_passres); - /* fall through */ - case 0x3: /* SQDMLAL, SQDMLAL2 */ - gen_helper_neon_addl_saturate_s32(tcg_res[pass], tcg_env, - tcg_res[pass], - tcg_passres); - break; - default: - g_assert_not_reached(); - } - } - - if (is_scalar) { - tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]); - } - } - - if (is_scalar) { - tcg_res[1] = tcg_constant_i64(0); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - } - } -} - /* C3.6 Data processing - SIMD, inc Crypto * * As the decode gets a little complex we are using a 
table based @@ -12213,19 +11828,15 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) */ static const AArch64DecodeTable data_proc_simd[] = { /* pattern , mask , fn */ - { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff }, { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, - { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */ /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */ { 0x0f000400, 0x9ff80400, disas_simd_mod_imm }, { 0x0f000400, 0x9f800400, disas_simd_shift_imm }, { 0x0e000000, 0xbf208c00, disas_simd_tb }, { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, { 0x2e000000, 0xbf208400, disas_simd_ext }, - { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff }, { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, - { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */ { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm }, { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 }, { 0x00000000, 0x00000000, NULL } diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c index f564d06ccf..b92773b4af 100644 --- a/target/arm/tcg/translate-m-nocp.c +++ b/target/arm/tcg/translate-m-nocp.c @@ -332,7 +332,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, if (dc_isar_feature(aa32_mve, s)) { /* QC is only present for MVE; otherwise RES0 */ TCGv_i32 qc = tcg_temp_new_i32(); - tcg_gen_andi_i32(qc, tmp, FPCR_QC); + tcg_gen_andi_i32(qc, tmp, FPSR_QC); /* * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; * here writing the same value into all elements is simplest. @@ -340,11 +340,11 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), 16, 16, qc); } - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_andi_i32(tmp, tmp, FPSR_NZCV_MASK); + fpscr = load_cpu_field_low32(vfp.fpsr); + tcg_gen_andi_i32(fpscr, fpscr, ~FPSR_NZCV_MASK); tcg_gen_or_i32(fpscr, fpscr, tmp); - store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); + store_cpu_field_low32(fpscr, vfp.fpsr); break; } case ARM_VFP_FPCXT_NS: @@ -390,7 +390,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, tcg_gen_deposit_i32(control, control, sfpa, R_V7M_CONTROL_SFPA_SHIFT, 1); store_cpu_field(control, v7m.control[M_REG_S]); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + tcg_gen_andi_i32(tmp, tmp, ~FPSR_NZCV_MASK); gen_helper_vfp_set_fpscr(tcg_env, tmp); s->base.is_jmp = DISAS_UPDATE_NOCHAIN; break; @@ -457,7 +457,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, case ARM_VFP_FPSCR_NZCVQC: tmp = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(tmp, tcg_env); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + tcg_gen_andi_i32(tmp, tmp, FPSR_NZCVQC_MASK); storefn(s, opaque, tmp, true); break; case QEMU_VFP_FPSCR_NZCV: @@ -465,8 +465,8 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, * Read just NZCV; this is a special case to avoid the * helper call for the "VMRS to CPSR.NZCV" insn. 
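
For reference while reading the FPSR_* masks used on either side of this hunk: a sketch of the architectural AArch32 FPSR bit layout those macros are assumed to encode. The authoritative definitions live in target/arm/cpu.h; the values below are the Arm ARM bit positions, reconstructed here for illustration only.

/* Cumulative exception flags (low byte) */
#define FPSR_IOC         (1 << 0)   /* invalid operation */
#define FPSR_DZC         (1 << 1)   /* division by zero */
#define FPSR_OFC         (1 << 2)   /* overflow */
#define FPSR_UFC         (1 << 3)   /* underflow */
#define FPSR_IXC         (1 << 4)   /* inexact */
#define FPSR_IDC         (1 << 7)   /* input denormal */
/* Sticky saturation and condition flags (high bits) */
#define FPSR_QC          (1 << 27)
#define FPSR_V           (1 << 28)
#define FPSR_C           (1 << 29)
#define FPSR_Z           (1 << 30)
#define FPSR_N           (1U << 31)
#define FPSR_NZCV_MASK   (FPSR_N | FPSR_Z | FPSR_C | FPSR_V)
#define FPSR_NZCVQC_MASK (FPSR_NZCV_MASK | FPSR_QC)
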
*/ - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + tmp = load_cpu_field_low32(vfp.fpsr); + tcg_gen_andi_i32(tmp, tmp, FPSR_NZCV_MASK); storefn(s, opaque, tmp, true); break; case ARM_VFP_FPCXT_S: @@ -476,7 +476,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, tmp = tcg_temp_new_i32(); sfpa = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(tmp, tcg_env); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + tcg_gen_andi_i32(tmp, tmp, ~FPSR_NZCV_MASK); control = load_cpu_field(v7m.control[M_REG_S]); tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); @@ -529,7 +529,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, sfpa = tcg_temp_new_i32(); fpscr = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(fpscr, tcg_env); - tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_andi_i32(tmp, fpscr, ~FPSR_NZCV_MASK); control = load_cpu_field(v7m.control[M_REG_S]); tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c index 39ec971ff7..cd5b848357 100644 --- a/target/arm/tcg/translate-vfp.c +++ b/target/arm/tcg/translate-vfp.c @@ -833,8 +833,8 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) break; case ARM_VFP_FPSCR: if (a->rt == 15) { - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + tmp = load_cpu_field_low32(vfp.fpsr); + tcg_gen_andi_i32(tmp, tmp, FPSR_NZCV_MASK); } else { tmp = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(tmp, tcg_env); diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index aba21f730f..a8672c857c 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -351,8 +351,7 @@ static inline TCGv_i32 get_ahp_flag(void) { TCGv_i32 ret = tcg_temp_new_i32(); - tcg_gen_ld_i32(ret, tcg_env, - offsetof(CPUARMState, vfp.xregs[ARM_VFP_FPSCR])); + tcg_gen_ld_i32(ret, tcg_env, offsetoflow32(CPUARMState, vfp.fpcr)); tcg_gen_extract_i32(ret, ret, 26, 1); return ret; diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 50d7042fa9..b3698da8ca 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -85,7 +85,7 @@ static inline int vfp_exceptbits_to_host(int target_bits) return host_bits; } -static uint32_t vfp_get_fpscr_from_host(CPUARMState *env) +static uint32_t vfp_get_fpsr_from_host(CPUARMState *env) { uint32_t i; @@ -99,14 +99,28 @@ static uint32_t vfp_get_fpscr_from_host(CPUARMState *env) return vfp_exceptbits_from_host(i); } -static void vfp_set_fpscr_to_host(CPUARMState *env, uint32_t val) +static void vfp_set_fpsr_to_host(CPUARMState *env, uint32_t val) { - int i; - uint32_t changed = env->vfp.xregs[ARM_VFP_FPSCR]; + /* + * The exception flags are ORed together when we read fpscr so we + * only need to preserve the current state in one of our + * float_status values. 
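
In this hunk, vfp_set_fpsr_to_host() funnels the guest's cumulative flag bits through vfp_exceptbits_to_host(), whose body sits outside the hunk context. A hedged reconstruction of the mapping it is assumed to perform, using QEMU's softfloat flag names; the real function may differ in detail:

static inline int vfp_exceptbits_to_host(int target_bits)
{
    int host_bits = 0;

    if (target_bits & 1) {      /* FPSR.IOC */
        host_bits |= float_flag_invalid;
    }
    if (target_bits & 2) {      /* FPSR.DZC */
        host_bits |= float_flag_divbyzero;
    }
    if (target_bits & 4) {      /* FPSR.OFC */
        host_bits |= float_flag_overflow;
    }
    if (target_bits & 8) {      /* FPSR.UFC */
        host_bits |= float_flag_underflow;
    }
    if (target_bits & 0x10) {   /* FPSR.IXC */
        host_bits |= float_flag_inexact;
    }
    if (target_bits & 0x80) {   /* FPSR.IDC */
        host_bits |= float_flag_input_denormal;
    }
    return host_bits;
}
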
+ */ + int i = vfp_exceptbits_to_host(val); + set_float_exception_flags(i, &env->vfp.fp_status); + set_float_exception_flags(0, &env->vfp.fp_status_f16); + set_float_exception_flags(0, &env->vfp.standard_fp_status); + set_float_exception_flags(0, &env->vfp.standard_fp_status_f16); +} + +static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) +{ + uint64_t changed = env->vfp.fpcr; changed ^= val; + changed &= mask; if (changed & (3 << 22)) { - i = (val >> 22) & 3; + int i = (val >> 22) & 3; switch (i) { case FPROUNDING_TIEEVEN: i = float_round_nearest_even; @@ -141,52 +155,56 @@ static void vfp_set_fpscr_to_host(CPUARMState *env, uint32_t val) set_default_nan_mode(dnan_enabled, &env->vfp.fp_status); set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16); } - - /* - * The exception flags are ORed together when we read fpscr so we - * only need to preserve the current state in one of our - * float_status values. - */ - i = vfp_exceptbits_to_host(val); - set_float_exception_flags(i, &env->vfp.fp_status); - set_float_exception_flags(0, &env->vfp.fp_status_f16); - set_float_exception_flags(0, &env->vfp.standard_fp_status); - set_float_exception_flags(0, &env->vfp.standard_fp_status_f16); } #else -static uint32_t vfp_get_fpscr_from_host(CPUARMState *env) +static uint32_t vfp_get_fpsr_from_host(CPUARMState *env) { return 0; } -static void vfp_set_fpscr_to_host(CPUARMState *env, uint32_t val) +static void vfp_set_fpsr_to_host(CPUARMState *env, uint32_t val) +{ +} + +static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) { } #endif -uint32_t HELPER(vfp_get_fpscr)(CPUARMState *env) +uint32_t vfp_get_fpcr(CPUARMState *env) { - uint32_t i, fpscr; - - fpscr = env->vfp.xregs[ARM_VFP_FPSCR] - | (env->vfp.vec_len << 16) - | (env->vfp.vec_stride << 20); + uint32_t fpcr = env->vfp.fpcr + | (env->vfp.vec_len << 16) + | (env->vfp.vec_stride << 20); /* - * M-profile LTPSIZE overlaps A-profile Stride; whichever of the - * two is not applicable to this CPU will always be zero. + * M-profile LTPSIZE is the same bits [18:16] as A-profile Len; whichever + * of the two is not applicable to this CPU will always be zero. */ - fpscr |= env->v7m.ltpsize << 16; + fpcr |= env->v7m.ltpsize << 16; - fpscr |= vfp_get_fpscr_from_host(env); + return fpcr; +} + +uint32_t vfp_get_fpsr(CPUARMState *env) +{ + uint32_t fpsr = env->vfp.fpsr; + uint32_t i; + + fpsr |= vfp_get_fpsr_from_host(env); i = env->vfp.qc[0] | env->vfp.qc[1] | env->vfp.qc[2] | env->vfp.qc[3]; - fpscr |= i ? FPCR_QC : 0; + fpsr |= i ? FPSR_QC : 0; + return fpsr; +} - return fpscr; +uint32_t HELPER(vfp_get_fpscr)(CPUARMState *env) +{ + return (vfp_get_fpcr(env) & FPSCR_FPCR_MASK) | + (vfp_get_fpsr(env) & FPSCR_FPSR_MASK); } uint32_t vfp_get_fpscr(CPUARMState *env) @@ -194,56 +212,91 @@ uint32_t vfp_get_fpscr(CPUARMState *env) return HELPER(vfp_get_fpscr)(env); } -void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val) +void vfp_set_fpsr(CPUARMState *env, uint32_t val) { ARMCPU *cpu = env_archcpu(env); + vfp_set_fpsr_to_host(env, val); + + if (arm_feature(env, ARM_FEATURE_NEON) || + cpu_isar_feature(aa32_mve, cpu)) { + /* + * The bit we set within vfp.qc[] is arbitrary; the array as a + * whole being zero/non-zero is what counts. + */ + env->vfp.qc[0] = val & FPSR_QC; + env->vfp.qc[1] = 0; + env->vfp.qc[2] = 0; + env->vfp.qc[3] = 0; + } + + /* + * The only FPSR bits we keep in vfp.fpsr are NZCV: + * the exception flags IOC|DZC|OFC|UFC|IXC|IDC are stored in + * fp_status, and QC is in vfp.qc[]. 
Store the NZCV bits there, + * and zero any of the other FPSR bits. + */ + val &= FPSR_NZCV_MASK; + env->vfp.fpsr = val; +} + +static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask) +{ + /* + * We only set FPCR bits defined by mask, and leave the others alone. + * We assume the mask is sensible (e.g. doesn't try to set only + * part of a field) + */ + ARMCPU *cpu = env_archcpu(env); + /* When ARMv8.2-FP16 is not supported, FZ16 is RES0. */ if (!cpu_isar_feature(any_fp16, cpu)) { val &= ~FPCR_FZ16; } - vfp_set_fpscr_to_host(env, val); + vfp_set_fpcr_to_host(env, val, mask); - if (!arm_feature(env, ARM_FEATURE_M)) { - /* - * Short-vector length and stride; on M-profile these bits - * are used for different purposes. - * We can't make this conditional be "if MVFR0.FPShVec != 0", - * because in v7A no-short-vector-support cores still had to - * allow Stride/Len to be written with the only effect that - * some insns are required to UNDEF if the guest sets them. - */ - env->vfp.vec_len = extract32(val, 16, 3); - env->vfp.vec_stride = extract32(val, 20, 2); - } else if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.ltpsize = extract32(val, FPCR_LTPSIZE_SHIFT, - FPCR_LTPSIZE_LENGTH); - } - - if (arm_feature(env, ARM_FEATURE_NEON) || - cpu_isar_feature(aa32_mve, cpu)) { - /* - * The bit we set within fpscr_q is arbitrary; the register as a - * whole being zero/non-zero is what counts. - * TODO: M-profile MVE also has a QC bit. - */ - env->vfp.qc[0] = val & FPCR_QC; - env->vfp.qc[1] = 0; - env->vfp.qc[2] = 0; - env->vfp.qc[3] = 0; + if (mask & (FPCR_LEN_MASK | FPCR_STRIDE_MASK)) { + if (!arm_feature(env, ARM_FEATURE_M)) { + /* + * Short-vector length and stride; on M-profile these bits + * are used for different purposes. + * We can't make this conditional be "if MVFR0.FPShVec != 0", + * because in v7A no-short-vector-support cores still had to + * allow Stride/Len to be written with the only effect that + * some insns are required to UNDEF if the guest sets them. + */ + env->vfp.vec_len = extract32(val, 16, 3); + env->vfp.vec_stride = extract32(val, 20, 2); + } else if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.ltpsize = extract32(val, FPCR_LTPSIZE_SHIFT, + FPCR_LTPSIZE_LENGTH); + } } /* * We don't implement trapped exception handling, so the * trap enable bits, IDE|IXE|UFE|OFE|DZE|IOE are all RAZ/WI (not RES0!) * - * The exception flags IOC|DZC|OFC|UFC|IXC|IDC are stored in - * fp_status; QC, Len and Stride are stored separately earlier. - * Clear out all of those and the RES0 bits: only NZCV, AHP, DN, - * FZ, RMode and FZ16 are kept in vfp.xregs[FPSCR]. + * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode + * and FZ16. Len, Stride and LTPSIZE we just handled. Store those bits + * there, and zero any of the other FPCR bits and the RES0 and RAZ/WI + * bits. 
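
The tail of vfp_set_fpcr_masked(), just below, is an ordinary read-modify-write under 'mask'. A minimal model of that update rule, with a hypothetical helper name:

static uint32_t masked_rmw(uint32_t old, uint32_t val, uint32_t mask)
{
    /* Bits outside 'mask' keep their old value; bits inside take 'val'. */
    return (old & ~mask) | (val & mask);
}

Hence vfp_set_fpcr(), which passes MAKE_64BIT_MASK(0, 32), overwrites all of the kept FPCR bits, while HELPER(vfp_set_fpscr) passes FPSCR_FPCR_MASK so that the FPSR half of a 32-bit FPSCR write cannot disturb FPCR state.
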
*/ - env->vfp.xregs[ARM_VFP_FPSCR] = val & 0xf7c80000; + val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16; + env->vfp.fpcr &= ~mask; + env->vfp.fpcr |= val; +} + +void vfp_set_fpcr(CPUARMState *env, uint32_t val) +{ + vfp_set_fpcr_masked(env, val, MAKE_64BIT_MASK(0, 32)); +} + +void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val) +{ + vfp_set_fpcr_masked(env, val, FPSCR_FPCR_MASK); + vfp_set_fpsr(env, val & FPSCR_FPSR_MASK); } void vfp_set_fpscr(CPUARMState *env, uint32_t val) @@ -315,8 +368,7 @@ static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp) default: g_assert_not_reached(); } - env->vfp.xregs[ARM_VFP_FPSCR] = - deposit32(env->vfp.xregs[ARM_VFP_FPSCR], 28, 4, flags); + env->vfp.fpsr = deposit64(env->vfp.fpsr, 28, 4, flags); /* NZCV */ } /* XXX: check quiet/signaling case */ @@ -1119,8 +1171,7 @@ uint32_t HELPER(vjcvt)(float64 value, CPUARMState *env) uint32_t z = (pair >> 32) == 0; /* Store Z, clear NCV, in FPSCR.NZCV. */ - env->vfp.xregs[ARM_VFP_FPSCR] - = (env->vfp.xregs[ARM_VFP_FPSCR] & ~CPSR_NZCV) | (z * CPSR_Z); + env->vfp.fpsr = (env->vfp.fpsr & ~FPSR_NZCV_MASK) | (z * FPSR_Z); return result; } diff --git a/target/avr/cpu.c b/target/avr/cpu.c index f53e1192b1..3132842d56 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -210,6 +210,7 @@ static const TCGCPUOps avr_tcg_ops = { .synchronize_from_tb = avr_cpu_synchronize_from_tb, .restore_state_to_opc = avr_restore_state_to_opc, .cpu_exec_interrupt = avr_cpu_exec_interrupt, + .cpu_exec_halt = avr_cpu_has_work, .tlb_fill = avr_cpu_tlb_fill, .do_interrupt = avr_cpu_do_interrupt, }; diff --git a/target/cris/cpu.c b/target/cris/cpu.c index 535ec39c73..ff31ca7fbc 100644 --- a/target/cris/cpu.c +++ b/target/cris/cpu.c @@ -186,6 +186,7 @@ static const TCGCPUOps crisv10_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = cris_cpu_tlb_fill, .cpu_exec_interrupt = cris_cpu_exec_interrupt, + .cpu_exec_halt = cris_cpu_has_work, .do_interrupt = crisv10_cpu_do_interrupt, #endif /* !CONFIG_USER_ONLY */ }; @@ -197,6 +198,7 @@ static const TCGCPUOps crisv32_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = cris_cpu_tlb_fill, .cpu_exec_interrupt = cris_cpu_exec_interrupt, + .cpu_exec_halt = cris_cpu_has_work, .do_interrupt = cris_cpu_do_interrupt, #endif /* !CONFIG_USER_ONLY */ }; diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index f0507874ce..7cf2e2f266 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -228,6 +228,7 @@ static const TCGCPUOps hppa_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = hppa_cpu_tlb_fill, .cpu_exec_interrupt = hppa_cpu_exec_interrupt, + .cpu_exec_halt = hppa_cpu_has_work, .do_interrupt = hppa_cpu_do_interrupt, .do_unaligned_access = hppa_cpu_do_unaligned_access, .do_transaction_failed = hppa_cpu_do_transaction_failed, diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 270f711f11..69f9ad7711 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -736,6 +736,7 @@ static const TCGCPUOps loongarch_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = loongarch_cpu_tlb_fill, .cpu_exec_interrupt = loongarch_cpu_exec_interrupt, + .cpu_exec_halt = loongarch_cpu_has_work, .do_interrupt = loongarch_cpu_do_interrupt, .do_transaction_failed = loongarch_cpu_do_transaction_failed, #endif diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index efd6bbded8..1d49f4cb23 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -536,6 +536,7 @@ static const TCGCPUOps m68k_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = m68k_cpu_tlb_fill, .cpu_exec_interrupt = 
m68k_cpu_exec_interrupt, + .cpu_exec_halt = m68k_cpu_has_work, .do_interrupt = m68k_cpu_do_interrupt, .do_transaction_failed = m68k_cpu_transaction_failed, #endif /* !CONFIG_USER_ONLY */ diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c index 41ad47d04c..135947ee80 100644 --- a/target/microblaze/cpu.c +++ b/target/microblaze/cpu.c @@ -413,6 +413,7 @@ static const TCGCPUOps mb_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = mb_cpu_tlb_fill, .cpu_exec_interrupt = mb_cpu_exec_interrupt, + .cpu_exec_halt = mb_cpu_has_work, .do_interrupt = mb_cpu_do_interrupt, .do_transaction_failed = mb_cpu_transaction_failed, .do_unaligned_access = mb_cpu_do_unaligned_access, diff --git a/target/mips/cpu.c b/target/mips/cpu.c index bbe01d07dd..89655b1900 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -555,6 +555,7 @@ static const TCGCPUOps mips_tcg_ops = { #if !defined(CONFIG_USER_ONLY) .tlb_fill = mips_cpu_tlb_fill, .cpu_exec_interrupt = mips_cpu_exec_interrupt, + .cpu_exec_halt = mips_cpu_has_work, .do_interrupt = mips_cpu_do_interrupt, .do_transaction_failed = mips_cpu_do_transaction_failed, .do_unaligned_access = mips_cpu_do_unaligned_access, diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c index fdaaa09fc8..6ec54ad7a6 100644 --- a/target/openrisc/cpu.c +++ b/target/openrisc/cpu.c @@ -233,6 +233,7 @@ static const TCGCPUOps openrisc_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = openrisc_cpu_tlb_fill, .cpu_exec_interrupt = openrisc_cpu_exec_interrupt, + .cpu_exec_halt = openrisc_cpu_has_work, .do_interrupt = openrisc_cpu_do_interrupt, #endif /* !CONFIG_USER_ONLY */ }; diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index 01e358a4a5..cdada7987d 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -1,3 +1,4 @@ + /* * PowerPC CPU initialization for qemu. 
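
Each hunk in this run follows the same recipe: point the new TCGCPUOps::cpu_exec_halt member at the target's existing has_work implementation. For a target that should leave halt exactly when an interrupt is pending, that function typically reduces to a one-line check. A hedged sketch for a hypothetical target "foo"; real targets usually also test target-specific interrupt-enable state before agreeing to wake:

static bool foo_cpu_has_work(CPUState *cs)
{
    /* Wake from halt once a hard interrupt is pending. */
    return cs->interrupt_request & CPU_INTERRUPT_HARD;
}
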
* @@ -7481,6 +7482,7 @@ static const TCGCPUOps ppc_tcg_ops = { #else .tlb_fill = ppc_cpu_tlb_fill, .cpu_exec_interrupt = ppc_cpu_exec_interrupt, + .cpu_exec_halt = ppc_cpu_has_work, .do_interrupt = ppc_cpu_do_interrupt, .cpu_exec_enter = ppc_cpu_exec_enter, .cpu_exec_exit = ppc_cpu_exec_exit, diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index a2640cf259..c53b0d5b40 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -903,7 +903,7 @@ static vaddr riscv_cpu_get_pc(CPUState *cs) return env->pc; } -static bool riscv_cpu_has_work(CPUState *cs) +bool riscv_cpu_has_work(CPUState *cs) { #ifndef CONFIG_USER_ONLY RISCVCPU *cpu = RISCV_CPU(cs); diff --git a/target/riscv/internals.h b/target/riscv/internals.h index 8239ae83cc..0ac17bc5ad 100644 --- a/target/riscv/internals.h +++ b/target/riscv/internals.h @@ -136,4 +136,7 @@ static inline float16 check_nanbox_h(CPURISCVState *env, uint64_t f) } } +/* Our implementation of CPUClass::has_work */ +bool riscv_cpu_has_work(CPUState *cs); + #endif diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c index ae25686824..ecf366d6c7 100644 --- a/target/riscv/tcg/tcg-cpu.c +++ b/target/riscv/tcg/tcg-cpu.c @@ -21,6 +21,7 @@ #include "exec/exec-all.h" #include "tcg-cpu.h" #include "cpu.h" +#include "internals.h" #include "pmu.h" #include "time_helper.h" #include "qapi/error.h" @@ -138,6 +139,7 @@ static const TCGCPUOps riscv_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = riscv_cpu_tlb_fill, .cpu_exec_interrupt = riscv_cpu_exec_interrupt, + .cpu_exec_halt = riscv_cpu_has_work, .do_interrupt = riscv_cpu_do_interrupt, .do_transaction_failed = riscv_cpu_do_transaction_failed, .do_unaligned_access = riscv_cpu_do_unaligned_access, diff --git a/target/rx/cpu.c b/target/rx/cpu.c index 8a584f0a11..36d2a6f189 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -192,6 +192,7 @@ static const TCGCPUOps rx_tcg_ops = { #ifndef CONFIG_USER_ONLY .cpu_exec_interrupt = rx_cpu_exec_interrupt, + .cpu_exec_halt = rx_cpu_has_work, .do_interrupt = rx_cpu_do_interrupt, #endif /* !CONFIG_USER_ONLY */ }; diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 2bbeaca36e..0fbfcd35d8 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -370,6 +370,7 @@ static const TCGCPUOps s390_tcg_ops = { #else .tlb_fill = s390_cpu_tlb_fill, .cpu_exec_interrupt = s390_cpu_exec_interrupt, + .cpu_exec_halt = s390_cpu_has_work, .do_interrupt = s390_cpu_do_interrupt, .debug_excp_handler = s390x_cpu_debug_excp_handler, .do_unaligned_access = s390x_cpu_do_unaligned_access, diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index 618aa7154e..8f07261dcf 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -254,6 +254,7 @@ static const TCGCPUOps superh_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = superh_cpu_tlb_fill, .cpu_exec_interrupt = superh_cpu_exec_interrupt, + .cpu_exec_halt = superh_cpu_has_work, .do_interrupt = superh_cpu_do_interrupt, .do_unaligned_access = superh_cpu_do_unaligned_access, .io_recompile_replay_branch = superh_io_recompile_replay_branch, diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c index 9bacfb68cb..54cb269e0a 100644 --- a/target/sparc/cpu.c +++ b/target/sparc/cpu.c @@ -926,6 +926,7 @@ static const TCGCPUOps sparc_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = sparc_cpu_tlb_fill, .cpu_exec_interrupt = sparc_cpu_exec_interrupt, + .cpu_exec_halt = sparc_cpu_has_work, .do_interrupt = sparc_cpu_do_interrupt, .do_transaction_failed = sparc_cpu_do_transaction_failed, .do_unaligned_access = sparc_cpu_do_unaligned_access, diff --git 
a/target/tricore/cpu.c b/target/tricore/cpu.c index bdefb84511..4d9c0368f2 100644 --- a/target/tricore/cpu.c +++ b/target/tricore/cpu.c @@ -169,6 +169,7 @@ static const TCGCPUOps tricore_tcg_ops = { .synchronize_from_tb = tricore_cpu_synchronize_from_tb, .restore_state_to_opc = tricore_restore_state_to_opc, .tlb_fill = tricore_cpu_tlb_fill, + .cpu_exec_halt = tricore_cpu_has_work, }; static void tricore_cpu_class_init(ObjectClass *c, void *data) diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c index de907cfeb1..a08c7a0b1f 100644 --- a/target/xtensa/cpu.c +++ b/target/xtensa/cpu.c @@ -234,6 +234,7 @@ static const TCGCPUOps xtensa_tcg_ops = { #ifndef CONFIG_USER_ONLY .tlb_fill = xtensa_cpu_tlb_fill, .cpu_exec_interrupt = xtensa_cpu_exec_interrupt, + .cpu_exec_halt = xtensa_cpu_has_work, .do_interrupt = xtensa_cpu_do_interrupt, .do_transaction_failed = xtensa_cpu_do_transaction_failed, .do_unaligned_access = xtensa_cpu_do_unaligned_access,
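
Taken together, the per-target hunks above all converge on the same shape of ops table. A composite sketch for a hypothetical target "foo" — not any single target's actual table, and the foo_* handlers are placeholders for functions each target already has:

static const TCGCPUOps foo_tcg_ops = {
    .initialize = foo_tcg_init,
    .restore_state_to_opc = foo_restore_state_to_opc,
#ifndef CONFIG_USER_ONLY
    .tlb_fill = foo_cpu_tlb_fill,
    .cpu_exec_interrupt = foo_cpu_exec_interrupt,
    .cpu_exec_halt = foo_cpu_has_work,  /* the hook added throughout this series */
    .do_interrupt = foo_cpu_do_interrupt,
#endif /* !CONFIG_USER_ONLY */
};
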