diff --git a/MAINTAINERS b/MAINTAINERS index 5580a36b68..b3af081c51 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2198,6 +2198,7 @@ Generic Loader M: Alistair Francis S: Maintained F: hw/core/generic-loader.c +F: hw/core/uboot_image.h F: include/hw/core/generic-loader.h F: docs/system/generic-loader.rst diff --git a/hw/arm/armv7m.c b/hw/arm/armv7m.c index 32349ec94b..990861ee5e 100644 --- a/hw/arm/armv7m.c +++ b/hw/arm/armv7m.c @@ -570,7 +570,7 @@ static void armv7m_reset(void *opaque) void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, int mem_size) { - int image_size; + ssize_t image_size; uint64_t entry; int big_endian; AddressSpace *as; diff --git a/hw/arm/boot.c b/hw/arm/boot.c index a8de33fd64..ada2717f76 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -881,7 +881,7 @@ static int do_arm_linux_init(Object *obj, void *opaque) return 0; } -static int64_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry, +static ssize_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry, uint64_t *lowaddr, uint64_t *highaddr, int elf_machine, AddressSpace *as) { @@ -892,7 +892,7 @@ static int64_t arm_load_elf(struct arm_boot_info *info, uint64_t *pentry, } elf_header; int data_swab = 0; bool big_endian; - int64_t ret = -1; + ssize_t ret = -1; Error *err = NULL; @@ -1014,7 +1014,7 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, /* Set up for a direct boot of a kernel image file. */ CPUState *cs; AddressSpace *as = arm_boot_address_space(cpu, info); - int kernel_size; + ssize_t kernel_size; int initrd_size; int is_linux = 0; uint64_t elf_entry; @@ -1093,7 +1093,7 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, if (kernel_size > info->ram_size) { error_report("kernel '%s' is too large to fit in RAM " - "(kernel size %d, RAM size %" PRId64 ")", + "(kernel size %zd, RAM size %" PRId64 ")", info->kernel_filename, kernel_size, info->ram_size); exit(1); } diff --git a/hw/core/generic-loader.c b/hw/core/generic-loader.c index c666545aa0..4f4d77908d 100644 --- a/hw/core/generic-loader.c +++ b/hw/core/generic-loader.c @@ -67,7 +67,7 @@ static void generic_loader_realize(DeviceState *dev, Error **errp) GenericLoaderState *s = GENERIC_LOADER(dev); hwaddr entry; int big_endian; - int size = 0; + ssize_t size = 0; s->set_pc = false; diff --git a/hw/core/loader.c b/hw/core/loader.c index edde657ac3..0548830733 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -114,17 +114,17 @@ ssize_t read_targphys(const char *name, return did; } -int load_image_targphys(const char *filename, - hwaddr addr, uint64_t max_sz) +ssize_t load_image_targphys(const char *filename, + hwaddr addr, uint64_t max_sz) { return load_image_targphys_as(filename, addr, max_sz, NULL); } /* return the size or -1 if error */ -int load_image_targphys_as(const char *filename, - hwaddr addr, uint64_t max_sz, AddressSpace *as) +ssize_t load_image_targphys_as(const char *filename, + hwaddr addr, uint64_t max_sz, AddressSpace *as) { - int size; + ssize_t size; size = get_image_size(filename); if (size < 0 || size > max_sz) { @@ -138,9 +138,9 @@ int load_image_targphys_as(const char *filename, return size; } -int load_image_mr(const char *filename, MemoryRegion *mr) +ssize_t load_image_mr(const char *filename, MemoryRegion *mr) { - int size; + ssize_t size; if (!memory_access_is_direct(mr, false)) { /* Can only load an image into RAM or ROM */ @@ -222,8 +222,8 @@ static void bswap_ahdr(struct exec *e) : (_N_SEGMENT_ROUND (_N_TXTENDADDR(x, target_page_size), target_page_size))) -int load_aout(const char *filename, hwaddr addr, int max_sz, - int bswap_needed, hwaddr target_page_size) +ssize_t load_aout(const char *filename, hwaddr addr, int max_sz, + int bswap_needed, hwaddr target_page_size) { int fd; ssize_t size, ret; @@ -617,13 +617,14 @@ toosmall: } /* Load a U-Boot image. */ -static int load_uboot_image(const char *filename, hwaddr *ep, hwaddr *loadaddr, - int *is_linux, uint8_t image_type, - uint64_t (*translate_fn)(void *, uint64_t), - void *translate_opaque, AddressSpace *as) +static ssize_t load_uboot_image(const char *filename, hwaddr *ep, + hwaddr *loadaddr, int *is_linux, + uint8_t image_type, + uint64_t (*translate_fn)(void *, uint64_t), + void *translate_opaque, AddressSpace *as) { int fd; - int size; + ssize_t size; hwaddr address; uboot_image_header_t h; uboot_image_header_t *hdr = &h; @@ -760,40 +761,40 @@ out: return ret; } -int load_uimage(const char *filename, hwaddr *ep, hwaddr *loadaddr, - int *is_linux, - uint64_t (*translate_fn)(void *, uint64_t), - void *translate_opaque) +ssize_t load_uimage(const char *filename, hwaddr *ep, hwaddr *loadaddr, + int *is_linux, + uint64_t (*translate_fn)(void *, uint64_t), + void *translate_opaque) { return load_uboot_image(filename, ep, loadaddr, is_linux, IH_TYPE_KERNEL, translate_fn, translate_opaque, NULL); } -int load_uimage_as(const char *filename, hwaddr *ep, hwaddr *loadaddr, - int *is_linux, - uint64_t (*translate_fn)(void *, uint64_t), - void *translate_opaque, AddressSpace *as) +ssize_t load_uimage_as(const char *filename, hwaddr *ep, hwaddr *loadaddr, + int *is_linux, + uint64_t (*translate_fn)(void *, uint64_t), + void *translate_opaque, AddressSpace *as) { return load_uboot_image(filename, ep, loadaddr, is_linux, IH_TYPE_KERNEL, translate_fn, translate_opaque, as); } /* Load a ramdisk. */ -int load_ramdisk(const char *filename, hwaddr addr, uint64_t max_sz) +ssize_t load_ramdisk(const char *filename, hwaddr addr, uint64_t max_sz) { return load_ramdisk_as(filename, addr, max_sz, NULL); } -int load_ramdisk_as(const char *filename, hwaddr addr, uint64_t max_sz, - AddressSpace *as) +ssize_t load_ramdisk_as(const char *filename, hwaddr addr, uint64_t max_sz, + AddressSpace *as) { return load_uboot_image(filename, NULL, &addr, NULL, IH_TYPE_RAMDISK, NULL, NULL, as); } /* Load a gzip-compressed kernel to a dynamically allocated buffer. */ -int load_image_gzipped_buffer(const char *filename, uint64_t max_sz, - uint8_t **buffer) +ssize_t load_image_gzipped_buffer(const char *filename, uint64_t max_sz, + uint8_t **buffer) { uint8_t *compressed_data = NULL; uint8_t *data = NULL; @@ -838,9 +839,9 @@ int load_image_gzipped_buffer(const char *filename, uint64_t max_sz, } /* Load a gzip-compressed kernel. */ -int load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz) +ssize_t load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz) { - int bytes; + ssize_t bytes; uint8_t *data; bytes = load_image_gzipped_buffer(filename, max_sz, &data); @@ -970,14 +971,15 @@ static void *rom_set_mr(Rom *rom, Object *owner, const char *name, bool ro) return data; } -int rom_add_file(const char *file, const char *fw_dir, - hwaddr addr, int32_t bootindex, - bool option_rom, MemoryRegion *mr, - AddressSpace *as) +ssize_t rom_add_file(const char *file, const char *fw_dir, + hwaddr addr, int32_t bootindex, + bool option_rom, MemoryRegion *mr, + AddressSpace *as) { MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); Rom *rom; - int rc, fd = -1; + ssize_t rc; + int fd = -1; char devpath[100]; if (as && mr) { @@ -1019,7 +1021,7 @@ int rom_add_file(const char *file, const char *fw_dir, lseek(fd, 0, SEEK_SET); rc = read(fd, rom->data, rom->datasize); if (rc != rom->datasize) { - fprintf(stderr, "rom: file %-20s: read error: rc=%d (expected %zd)\n", + fprintf(stderr, "rom: file %-20s: read error: rc=%zd (expected %zd)\n", rom->name, rc, rom->datasize); goto err; } @@ -1138,12 +1140,12 @@ int rom_add_elf_program(const char *name, GMappedFile *mapped_file, void *data, return 0; } -int rom_add_vga(const char *file) +ssize_t rom_add_vga(const char *file) { return rom_add_file(file, "vgaroms", 0, -1, true, NULL, NULL); } -int rom_add_option(const char *file, int32_t bootindex) +ssize_t rom_add_option(const char *file, int32_t bootindex) { return rom_add_file(file, "genroms", 0, bootindex, true, NULL, NULL); } @@ -1846,11 +1848,12 @@ out: } /* return size or -1 if error */ -int load_targphys_hex_as(const char *filename, hwaddr *entry, AddressSpace *as) +ssize_t load_targphys_hex_as(const char *filename, hwaddr *entry, + AddressSpace *as) { gsize hex_blob_size; gchar *hex_blob; - int total_size = 0; + ssize_t total_size = 0; if (!g_file_get_contents(filename, &hex_blob, &hex_blob_size, NULL)) { return -1; diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 78b05ab7a2..6003b4b2df 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1115,7 +1115,7 @@ void x86_bios_rom_init(MachineState *ms, const char *default_firmware, char *filename; MemoryRegion *bios, *isa_bios; int bios_size, isa_bios_size; - int ret; + ssize_t ret; /* BIOS load */ bios_name = ms->firmware ?: default_firmware; diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c index eebbcf33d4..56d60e9ac9 100644 --- a/hw/intc/sifive_plic.c +++ b/hw/intc/sifive_plic.c @@ -431,7 +431,7 @@ DeviceState *sifive_plic_create(hwaddr addr, char *hart_config, uint32_t context_stride, uint32_t aperture_size) { DeviceState *dev = qdev_new(TYPE_SIFIVE_PLIC); - int i, j = 0; + int i; SiFivePLICState *plic; assert(enable_stride == (enable_stride & -enable_stride)); @@ -451,18 +451,17 @@ DeviceState *sifive_plic_create(hwaddr addr, char *hart_config, sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); plic = SIFIVE_PLIC(dev); - for (i = 0; i < num_harts; i++) { - CPUState *cpu = qemu_get_cpu(hartid_base + i); - if (plic->addr_config[j].mode == PLICMode_M) { - j++; - qdev_connect_gpio_out(dev, num_harts + i, + for (i = 0; i < plic->num_addrs; i++) { + int cpu_num = plic->addr_config[i].hartid; + CPUState *cpu = qemu_get_cpu(hartid_base + cpu_num); + + if (plic->addr_config[i].mode == PLICMode_M) { + qdev_connect_gpio_out(dev, num_harts + cpu_num, qdev_get_gpio_in(DEVICE(cpu), IRQ_M_EXT)); } - - if (plic->addr_config[j].mode == PLICMode_S) { - j++; - qdev_connect_gpio_out(dev, i, + if (plic->addr_config[i].mode == PLICMode_S) { + qdev_connect_gpio_out(dev, cpu_num, qdev_get_gpio_in(DEVICE(cpu), IRQ_S_EXT)); } } diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c index 57a41df8e9..2d80f40b31 100644 --- a/hw/riscv/boot.c +++ b/hw/riscv/boot.c @@ -129,7 +129,8 @@ target_ulong riscv_load_firmware(const char *firmware_filename, hwaddr firmware_load_addr, symbol_fn_t sym_cb) { - uint64_t firmware_entry, firmware_size, firmware_end; + uint64_t firmware_entry, firmware_end; + ssize_t firmware_size; if (load_elf_ram_sym(firmware_filename, NULL, NULL, NULL, &firmware_entry, NULL, &firmware_end, NULL, @@ -185,7 +186,7 @@ target_ulong riscv_load_kernel(const char *kernel_filename, hwaddr riscv_load_initrd(const char *filename, uint64_t mem_size, uint64_t kernel_entry, hwaddr *start) { - int size; + ssize_t size; /* * We want to put the initrd far enough into RAM that when the diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index 293e9c95b7..bc424dd2f5 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -975,6 +975,23 @@ static void create_fdt_flash(RISCVVirtState *s, const MemMapEntry *memmap) g_free(name); } +static void create_fdt_fw_cfg(RISCVVirtState *s, const MemMapEntry *memmap) +{ + char *nodename; + MachineState *mc = MACHINE(s); + hwaddr base = memmap[VIRT_FW_CFG].base; + hwaddr size = memmap[VIRT_FW_CFG].size; + + nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); + qemu_fdt_add_subnode(mc->fdt, nodename); + qemu_fdt_setprop_string(mc->fdt, nodename, + "compatible", "qemu,fw-cfg-mmio"); + qemu_fdt_setprop_sized_cells(mc->fdt, nodename, "reg", + 2, base, 2, size); + qemu_fdt_setprop(mc->fdt, nodename, "dma-coherent", NULL, 0); + g_free(nodename); +} + static void create_fdt(RISCVVirtState *s, const MemMapEntry *memmap, uint64_t mem_size, const char *cmdline, bool is_32_bit) { @@ -1023,6 +1040,7 @@ static void create_fdt(RISCVVirtState *s, const MemMapEntry *memmap, create_fdt_rtc(s, memmap, irq_mmio_phandle); create_fdt_flash(s, memmap); + create_fdt_fw_cfg(s, memmap); update_bootargs: if (cmdline && *cmdline) { @@ -1082,22 +1100,12 @@ static inline DeviceState *gpex_pcie_init(MemoryRegion *sys_mem, static FWCfgState *create_fw_cfg(const MachineState *mc) { hwaddr base = virt_memmap[VIRT_FW_CFG].base; - hwaddr size = virt_memmap[VIRT_FW_CFG].size; FWCfgState *fw_cfg; - char *nodename; fw_cfg = fw_cfg_init_mem_wide(base + 8, base, 8, base + 16, &address_space_memory); fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, (uint16_t)mc->smp.cpus); - nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); - qemu_fdt_add_subnode(mc->fdt, nodename); - qemu_fdt_setprop_string(mc->fdt, nodename, - "compatible", "qemu,fw-cfg-mmio"); - qemu_fdt_setprop_sized_cells(mc->fdt, nodename, "reg", - 2, base, 2, size); - qemu_fdt_setprop(mc->fdt, nodename, "dma-coherent", NULL, 0); - g_free(nodename); return fw_cfg; } diff --git a/include/hw/loader.h b/include/hw/loader.h index 5572108ba5..70248e0da7 100644 --- a/include/hw/loader.h +++ b/include/hw/loader.h @@ -40,8 +40,8 @@ ssize_t load_image_size(const char *filename, void *addr, size_t size); * * Returns the size of the loaded image on success, -1 otherwise. */ -int load_image_targphys_as(const char *filename, - hwaddr addr, uint64_t max_sz, AddressSpace *as); +ssize_t load_image_targphys_as(const char *filename, + hwaddr addr, uint64_t max_sz, AddressSpace *as); /**load_targphys_hex_as: * @filename: Path to the .hex file @@ -53,14 +53,15 @@ int load_image_targphys_as(const char *filename, * * Returns the size of the loaded .hex file on success, -1 otherwise. */ -int load_targphys_hex_as(const char *filename, hwaddr *entry, AddressSpace *as); +ssize_t load_targphys_hex_as(const char *filename, hwaddr *entry, + AddressSpace *as); /** load_image_targphys: * Same as load_image_targphys_as(), but doesn't allow the caller to specify * an AddressSpace. */ -int load_image_targphys(const char *filename, hwaddr, - uint64_t max_sz); +ssize_t load_image_targphys(const char *filename, hwaddr, + uint64_t max_sz); /** * load_image_mr: load an image into a memory region @@ -73,7 +74,7 @@ int load_image_targphys(const char *filename, hwaddr, * If the file is larger than the memory region's size the call will fail. * Returns -1 on failure, or the size of the file. */ -int load_image_mr(const char *filename, MemoryRegion *mr); +ssize_t load_image_mr(const char *filename, MemoryRegion *mr); /* This is the limit on the maximum uncompressed image size that * load_image_gzipped_buffer() and load_image_gzipped() will read. It prevents @@ -81,9 +82,9 @@ int load_image_mr(const char *filename, MemoryRegion *mr); */ #define LOAD_IMAGE_MAX_GUNZIP_BYTES (256 << 20) -int load_image_gzipped_buffer(const char *filename, uint64_t max_sz, - uint8_t **buffer); -int load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz); +ssize_t load_image_gzipped_buffer(const char *filename, uint64_t max_sz, + uint8_t **buffer); +ssize_t load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz); #define ELF_LOAD_FAILED -1 #define ELF_LOAD_NOT_ELF -2 @@ -183,8 +184,8 @@ ssize_t load_elf(const char *filename, */ void load_elf_hdr(const char *filename, void *hdr, bool *is64, Error **errp); -int load_aout(const char *filename, hwaddr addr, int max_sz, - int bswap_needed, hwaddr target_page_size); +ssize_t load_aout(const char *filename, hwaddr addr, int max_sz, + int bswap_needed, hwaddr target_page_size); #define LOAD_UIMAGE_LOADADDR_INVALID (-1) @@ -205,19 +206,19 @@ int load_aout(const char *filename, hwaddr addr, int max_sz, * * Returns the size of the loaded image on success, -1 otherwise. */ -int load_uimage_as(const char *filename, hwaddr *ep, - hwaddr *loadaddr, int *is_linux, - uint64_t (*translate_fn)(void *, uint64_t), - void *translate_opaque, AddressSpace *as); +ssize_t load_uimage_as(const char *filename, hwaddr *ep, + hwaddr *loadaddr, int *is_linux, + uint64_t (*translate_fn)(void *, uint64_t), + void *translate_opaque, AddressSpace *as); /** load_uimage: * Same as load_uimage_as(), but doesn't allow the caller to specify an * AddressSpace. */ -int load_uimage(const char *filename, hwaddr *ep, - hwaddr *loadaddr, int *is_linux, - uint64_t (*translate_fn)(void *, uint64_t), - void *translate_opaque); +ssize_t load_uimage(const char *filename, hwaddr *ep, + hwaddr *loadaddr, int *is_linux, + uint64_t (*translate_fn)(void *, uint64_t), + void *translate_opaque); /** * load_ramdisk_as: @@ -232,15 +233,15 @@ int load_uimage(const char *filename, hwaddr *ep, * * Returns the size of the loaded image on success, -1 otherwise. */ -int load_ramdisk_as(const char *filename, hwaddr addr, uint64_t max_sz, - AddressSpace *as); +ssize_t load_ramdisk_as(const char *filename, hwaddr addr, uint64_t max_sz, + AddressSpace *as); /** * load_ramdisk: * Same as load_ramdisk_as(), but doesn't allow the caller to specify * an AddressSpace. */ -int load_ramdisk(const char *filename, hwaddr addr, uint64_t max_sz); +ssize_t load_ramdisk(const char *filename, hwaddr addr, uint64_t max_sz); ssize_t gunzip(void *dst, size_t dstlen, uint8_t *src, size_t srclen); @@ -253,9 +254,9 @@ void pstrcpy_targphys(const char *name, extern bool option_rom_has_mr; extern bool rom_file_has_mr; -int rom_add_file(const char *file, const char *fw_dir, - hwaddr addr, int32_t bootindex, - bool option_rom, MemoryRegion *mr, AddressSpace *as); +ssize_t rom_add_file(const char *file, const char *fw_dir, + hwaddr addr, int32_t bootindex, + bool option_rom, MemoryRegion *mr, AddressSpace *as); MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len, size_t max_len, hwaddr addr, const char *fw_file_name, @@ -336,8 +337,8 @@ void hmp_info_roms(Monitor *mon, const QDict *qdict); #define rom_add_blob_fixed_as(_f, _b, _l, _a, _as) \ rom_add_blob(_f, _b, _l, _l, _a, NULL, NULL, NULL, _as, true) -int rom_add_vga(const char *file); -int rom_add_option(const char *file, int32_t bootindex); +ssize_t rom_add_vga(const char *file); +ssize_t rom_add_option(const char *file, int32_t bootindex); /* This is the usual maximum in uboot, so if a uImage overflows this, it would * overflow on real hardware too. */ diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index a91253d4bd..05e6521351 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -118,6 +118,8 @@ static const char * const riscv_intr_names[] = { "reserved" }; +static void register_cpu_props(DeviceState *dev); + const char *riscv_cpu_get_trap_name(target_ulong cause, bool async) { if (async) { @@ -161,6 +163,7 @@ static void riscv_any_cpu_init(Object *obj) set_misa(env, MXL_RV64, RVI | RVM | RVA | RVF | RVD | RVC | RVU); #endif set_priv_version(env, PRIV_VERSION_1_12_0); + register_cpu_props(DEVICE(obj)); } #if defined(TARGET_RISCV64) @@ -169,6 +172,7 @@ static void rv64_base_cpu_init(Object *obj) CPURISCVState *env = &RISCV_CPU(obj)->env; /* We set this in the realise function */ set_misa(env, MXL_RV64, 0); + register_cpu_props(DEVICE(obj)); } static void rv64_sifive_u_cpu_init(Object *obj) @@ -181,9 +185,11 @@ static void rv64_sifive_u_cpu_init(Object *obj) static void rv64_sifive_e_cpu_init(Object *obj) { CPURISCVState *env = &RISCV_CPU(obj)->env; + RISCVCPU *cpu = RISCV_CPU(obj); + set_misa(env, MXL_RV64, RVI | RVM | RVA | RVC | RVU); set_priv_version(env, PRIV_VERSION_1_10_0); - qdev_prop_set_bit(DEVICE(obj), "mmu", false); + cpu->cfg.mmu = false; } static void rv128_base_cpu_init(Object *obj) @@ -197,6 +203,7 @@ static void rv128_base_cpu_init(Object *obj) CPURISCVState *env = &RISCV_CPU(obj)->env; /* We set this in the realise function */ set_misa(env, MXL_RV128, 0); + register_cpu_props(DEVICE(obj)); } #else static void rv32_base_cpu_init(Object *obj) @@ -204,6 +211,7 @@ static void rv32_base_cpu_init(Object *obj) CPURISCVState *env = &RISCV_CPU(obj)->env; /* We set this in the realise function */ set_misa(env, MXL_RV32, 0); + register_cpu_props(DEVICE(obj)); } static void rv32_sifive_u_cpu_init(Object *obj) @@ -216,27 +224,33 @@ static void rv32_sifive_u_cpu_init(Object *obj) static void rv32_sifive_e_cpu_init(Object *obj) { CPURISCVState *env = &RISCV_CPU(obj)->env; + RISCVCPU *cpu = RISCV_CPU(obj); + set_misa(env, MXL_RV32, RVI | RVM | RVA | RVC | RVU); set_priv_version(env, PRIV_VERSION_1_10_0); - qdev_prop_set_bit(DEVICE(obj), "mmu", false); + cpu->cfg.mmu = false; } static void rv32_ibex_cpu_init(Object *obj) { CPURISCVState *env = &RISCV_CPU(obj)->env; + RISCVCPU *cpu = RISCV_CPU(obj); + set_misa(env, MXL_RV32, RVI | RVM | RVC | RVU); set_priv_version(env, PRIV_VERSION_1_10_0); - qdev_prop_set_bit(DEVICE(obj), "mmu", false); - qdev_prop_set_bit(DEVICE(obj), "x-epmp", true); + cpu->cfg.mmu = false; + cpu->cfg.epmp = true; } static void rv32_imafcu_nommu_cpu_init(Object *obj) { CPURISCVState *env = &RISCV_CPU(obj)->env; + RISCVCPU *cpu = RISCV_CPU(obj); + set_misa(env, MXL_RV32, RVI | RVM | RVA | RVF | RVC | RVU); set_priv_version(env, PRIV_VERSION_1_10_0); set_resetvec(env, DEFAULT_RSTVEC); - qdev_prop_set_bit(DEVICE(obj), "mmu", false); + cpu->cfg.mmu = false; } #endif @@ -249,6 +263,7 @@ static void riscv_host_cpu_init(Object *obj) #elif defined(TARGET_RISCV64) set_misa(env, MXL_RV64, 0); #endif + register_cpu_props(DEVICE(obj)); } #endif @@ -391,7 +406,7 @@ static bool riscv_cpu_has_work(CPUState *cs) * Definition of the WFI instruction requires it to ignore the privilege * mode and delegation registers, but respect individual enables */ - return (env->mip & env->mie) != 0; + return riscv_cpu_all_pending(env) != 0; #else return true; #endif @@ -600,6 +615,11 @@ static void riscv_cpu_realize(DeviceState *dev, Error **errp) cpu->cfg.ext_ifencei = true; } + if (cpu->cfg.ext_m && cpu->cfg.ext_zmmul) { + warn_report("Zmmul will override M"); + cpu->cfg.ext_m = false; + } + if (cpu->cfg.ext_i && cpu->cfg.ext_e) { error_setg(errp, "I and E extensions are incompatible"); @@ -831,6 +851,12 @@ static void riscv_cpu_init(Object *obj) { RISCVCPU *cpu = RISCV_CPU(obj); + cpu->cfg.ext_counters = true; + cpu->cfg.ext_ifencei = true; + cpu->cfg.ext_icsr = true; + cpu->cfg.mmu = true; + cpu->cfg.pmp = true; + cpu_set_cpustate_pointers(cpu); #ifndef CONFIG_USER_ONLY @@ -839,7 +865,7 @@ static void riscv_cpu_init(Object *obj) #endif /* CONFIG_USER_ONLY */ } -static Property riscv_cpu_properties[] = { +static Property riscv_cpu_extensions[] = { /* Defaults for standard extensions */ DEFINE_PROP_BOOL("i", RISCVCPU, cfg.ext_i, true), DEFINE_PROP_BOOL("e", RISCVCPU, cfg.ext_e, false), @@ -862,17 +888,12 @@ static Property riscv_cpu_properties[] = { DEFINE_PROP_BOOL("Zve64f", RISCVCPU, cfg.ext_zve64f, false), DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true), DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true), - DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true), DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec), DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec), DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128), DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64), - DEFINE_PROP_UINT32("mvendorid", RISCVCPU, cfg.mvendorid, 0), - DEFINE_PROP_UINT64("marchid", RISCVCPU, cfg.marchid, RISCV_CPU_MARCHID), - DEFINE_PROP_UINT64("mimpid", RISCVCPU, cfg.mimpid, RISCV_CPU_MIMPID), - DEFINE_PROP_BOOL("svinval", RISCVCPU, cfg.ext_svinval, false), DEFINE_PROP_BOOL("svnapot", RISCVCPU, cfg.ext_svnapot, false), DEFINE_PROP_BOOL("svpbmt", RISCVCPU, cfg.ext_svpbmt, false), @@ -905,13 +926,35 @@ static Property riscv_cpu_properties[] = { /* These are experimental so mark with 'x-' */ DEFINE_PROP_BOOL("x-j", RISCVCPU, cfg.ext_j, false), + DEFINE_PROP_BOOL("x-zmmul", RISCVCPU, cfg.ext_zmmul, false), /* ePMP 0.9.3 */ DEFINE_PROP_BOOL("x-epmp", RISCVCPU, cfg.epmp, false), DEFINE_PROP_BOOL("x-aia", RISCVCPU, cfg.aia, false), + DEFINE_PROP_END_OF_LIST(), +}; + +static void register_cpu_props(DeviceState *dev) +{ + Property *prop; + + for (prop = riscv_cpu_extensions; prop && prop->name; prop++) { + qdev_property_add_static(dev, prop); + } +} + +static Property riscv_cpu_properties[] = { + DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true), + + DEFINE_PROP_UINT32("mvendorid", RISCVCPU, cfg.mvendorid, 0), + DEFINE_PROP_UINT64("marchid", RISCVCPU, cfg.marchid, RISCV_CPU_MARCHID), + DEFINE_PROP_UINT64("mimpid", RISCVCPU, cfg.mimpid, RISCV_CPU_MIMPID), + DEFINE_PROP_UINT64("resetvec", RISCVCPU, cfg.resetvec, DEFAULT_RSTVEC), DEFINE_PROP_BOOL("short-isa-string", RISCVCPU, cfg.short_isa_string, false), + + DEFINE_PROP_BOOL("rvv_ta_all_1s", RISCVCPU, cfg.rvv_ta_all_1s, false), DEFINE_PROP_END_OF_LIST(), }; @@ -1031,6 +1074,7 @@ static void riscv_isa_string_ext(RISCVCPU *cpu, char **isa_str, int max_str_len) struct isa_ext_data isa_edata_arr[] = { ISA_EDATA_ENTRY(zicsr, ext_icsr), ISA_EDATA_ENTRY(zifencei, ext_ifencei), + ISA_EDATA_ENTRY(zmmul, ext_zmmul), ISA_EDATA_ENTRY(zfh, ext_zfh), ISA_EDATA_ENTRY(zfhmin, ext_zfhmin), ISA_EDATA_ENTRY(zfinx, ext_zfinx), diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h index f08c3e8813..7d6397acdf 100644 --- a/target/riscv/cpu.h +++ b/target/riscv/cpu.h @@ -411,6 +411,8 @@ struct RISCVCPUConfig { bool ext_zhinxmin; bool ext_zve32f; bool ext_zve64f; + bool ext_zmmul; + bool rvv_ta_all_1s; uint32_t mvendorid; uint64_t marchid; @@ -488,6 +490,7 @@ int riscv_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int riscv_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); int riscv_cpu_hviprio_index2irq(int index, int *out_irq, int *out_rdzero); uint8_t riscv_cpu_default_priority(int irq); +uint64_t riscv_cpu_all_pending(CPURISCVState *env); int riscv_cpu_mirq_pending(CPURISCVState *env); int riscv_cpu_sirq_pending(CPURISCVState *env); int riscv_cpu_vsirq_pending(CPURISCVState *env); @@ -565,6 +568,7 @@ FIELD(TB_FLAGS, XL, 20, 2) /* If PointerMasking should be applied */ FIELD(TB_FLAGS, PM_MASK_ENABLED, 22, 1) FIELD(TB_FLAGS, PM_BASE_ENABLED, 23, 1) +FIELD(TB_FLAGS, VTA, 24, 1) #ifdef TARGET_RISCV32 #define riscv_cpu_mxl(env) ((void)(env), MXL_RV32) diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index d99fac9d2d..4a6700c890 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -65,6 +65,8 @@ void cpu_get_tb_cpu_state(CPURISCVState *env, target_ulong *pc, flags = FIELD_DP32(flags, TB_FLAGS, LMUL, FIELD_EX64(env->vtype, VTYPE, VLMUL)); flags = FIELD_DP32(flags, TB_FLAGS, VL_EQ_VLMAX, vl_eq_vlmax); + flags = FIELD_DP32(flags, TB_FLAGS, VTA, + FIELD_EX64(env->vtype, VTYPE, VTA)); } else { flags = FIELD_DP32(flags, TB_FLAGS, VILL, 1); } @@ -340,7 +342,7 @@ static int riscv_cpu_pending_to_irq(CPURISCVState *env, return best_irq; } -static uint64_t riscv_cpu_all_pending(CPURISCVState *env) +uint64_t riscv_cpu_all_pending(CPURISCVState *env) { uint32_t gein = get_field(env->hstatus, HSTATUS_VGEIN); uint64_t vsgein = (env->hgeip & (1ULL << gein)) ? MIP_VSEIP : 0; diff --git a/target/riscv/debug.c b/target/riscv/debug.c index 2f2a51c732..fc6e13222f 100644 --- a/target/riscv/debug.c +++ b/target/riscv/debug.c @@ -77,6 +77,7 @@ static inline target_ulong trigger_type(CPURISCVState *env, tdata1 = RV32_TYPE(type); break; case MXL_RV64: + case MXL_RV128: tdata1 = RV64_TYPE(type); break; default: @@ -123,6 +124,7 @@ static target_ulong tdata1_validate(CPURISCVState *env, target_ulong val, tdata1 = RV32_TYPE(t); break; case MXL_RV64: + case MXL_RV128: type = extract64(val, 60, 4); dmode = extract64(val, 59, 1); tdata1 = RV64_TYPE(t); diff --git a/target/riscv/insn_trans/trans_rvm.c.inc b/target/riscv/insn_trans/trans_rvm.c.inc index 16b029edf0..ec7f705aab 100644 --- a/target/riscv/insn_trans/trans_rvm.c.inc +++ b/target/riscv/insn_trans/trans_rvm.c.inc @@ -18,6 +18,12 @@ * this program. If not, see . */ +#define REQUIRE_M_OR_ZMMUL(ctx) do { \ + if (!ctx->cfg_ptr->ext_zmmul && !has_ext(ctx, RVM)) { \ + return false; \ + } \ +} while (0) + static void gen_mulhu_i128(TCGv r2, TCGv r3, TCGv al, TCGv ah, TCGv bl, TCGv bh) { TCGv tmpl = tcg_temp_new(); @@ -65,7 +71,7 @@ static void gen_mul_i128(TCGv rl, TCGv rh, static bool trans_mul(DisasContext *ctx, arg_mul *a) { - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); return gen_arith(ctx, a, EXT_NONE, tcg_gen_mul_tl, gen_mul_i128); } @@ -109,7 +115,7 @@ static void gen_mulh_w(TCGv ret, TCGv s1, TCGv s2) static bool trans_mulh(DisasContext *ctx, arg_mulh *a) { - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); return gen_arith_per_ol(ctx, a, EXT_SIGN, gen_mulh, gen_mulh_w, gen_mulh_i128); } @@ -161,7 +167,7 @@ static void gen_mulhsu_w(TCGv ret, TCGv arg1, TCGv arg2) static bool trans_mulhsu(DisasContext *ctx, arg_mulhsu *a) { - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); return gen_arith_per_ol(ctx, a, EXT_NONE, gen_mulhsu, gen_mulhsu_w, gen_mulhsu_i128); } @@ -176,7 +182,7 @@ static void gen_mulhu(TCGv ret, TCGv s1, TCGv s2) static bool trans_mulhu(DisasContext *ctx, arg_mulhu *a) { - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); /* gen_mulh_w works for either sign as input. */ return gen_arith_per_ol(ctx, a, EXT_ZERO, gen_mulhu, gen_mulh_w, gen_mulhu_i128); @@ -349,7 +355,7 @@ static bool trans_remu(DisasContext *ctx, arg_remu *a) static bool trans_mulw(DisasContext *ctx, arg_mulw *a) { REQUIRE_64_OR_128BIT(ctx); - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); ctx->ol = MXL_RV32; return gen_arith(ctx, a, EXT_NONE, tcg_gen_mul_tl, NULL); } @@ -389,7 +395,7 @@ static bool trans_remuw(DisasContext *ctx, arg_remuw *a) static bool trans_muld(DisasContext *ctx, arg_muld *a) { REQUIRE_128BIT(ctx); - REQUIRE_EXT(ctx, RVM); + REQUIRE_M_OR_ZMMUL(ctx); ctx->ol = MXL_RV64; return gen_arith(ctx, a, EXT_SIGN, tcg_gen_mul_tl, NULL); } diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index 391c61fe93..6c091824b6 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -652,6 +652,7 @@ static bool ldst_us_trans(uint32_t vd, uint32_t rs1, uint32_t data, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -710,6 +711,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); } @@ -773,6 +775,8 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew) /* EMUL = 1, NFIELDS = 1 */ data = FIELD_DP32(data, VDATA, LMUL, 0); data = FIELD_DP32(data, VDATA, NF, 1); + /* Mask destination register are always tail-agnostic */ + data = FIELD_DP32(data, VDATA, VTA, s->cfg_vta_all_1s); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); } @@ -818,6 +822,7 @@ static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -860,6 +865,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); } @@ -925,6 +931,7 @@ static bool ldst_index_trans(uint32_t vd, uint32_t rs1, uint32_t vs2, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -988,6 +995,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); } @@ -1067,6 +1075,7 @@ static bool ldff_trans(uint32_t vd, uint32_t rs1, uint32_t data, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -1104,6 +1113,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldff_trans(a->rd, a->rs1, data, fn, s); } @@ -1225,8 +1235,9 @@ do_opivv_gvec(DisasContext *s, arg_rmrr *a, GVecGen3Fn *gvec_fn, } tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { gvec_fn(s->sew, vreg_ofs(s, a->rd), vreg_ofs(s, a->rs2), vreg_ofs(s, a->rs1), MAXSZ(s), MAXSZ(s)); @@ -1235,6 +1246,7 @@ do_opivv_gvec(DisasContext *s, arg_rmrr *a, GVecGen3Fn *gvec_fn, data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2), cpu_env, s->cfg_ptr->vlen / 8, @@ -1272,6 +1284,7 @@ static bool opivx_trans(uint32_t vd, uint32_t rs1, uint32_t vs2, uint32_t vm, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -1280,6 +1293,8 @@ static bool opivx_trans(uint32_t vd, uint32_t rs1, uint32_t vs2, uint32_t vm, data = FIELD_DP32(data, VDATA, VM, vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); + data = FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s); desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlen / 8, s->cfg_ptr->vlen / 8, data)); @@ -1315,7 +1330,7 @@ do_opivx_gvec(DisasContext *s, arg_rmrr *a, GVecGen2sFn *gvec_fn, return false; } - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { TCGv_i64 src1 = tcg_temp_new_i64(); tcg_gen_ext_tl_i64(src1, get_gpr(s, a->rs1, EXT_SIGN)); @@ -1436,6 +1451,7 @@ static bool opivi_trans(uint32_t vd, uint32_t imm, uint32_t vs2, uint32_t vm, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -1444,6 +1460,8 @@ static bool opivi_trans(uint32_t vd, uint32_t imm, uint32_t vs2, uint32_t vm, data = FIELD_DP32(data, VDATA, VM, vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); + data = FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s); desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlen / 8, s->cfg_ptr->vlen / 8, data)); @@ -1472,7 +1490,7 @@ do_opivi_gvec(DisasContext *s, arg_rmrr *a, GVecGen2iFn *gvec_fn, return false; } - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { gvec_fn(s->sew, vreg_ofs(s, a->rd), vreg_ofs(s, a->rs2), extract_imm(s, a->rs1, imm_mode), MAXSZ(s), MAXSZ(s)); mark_vs_dirty(s); @@ -1522,9 +1540,11 @@ static bool do_opivv_widen(DisasContext *s, arg_rmrr *a, uint32_t data = 0; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2), @@ -1602,9 +1622,11 @@ static bool do_opiwv_widen(DisasContext *s, arg_rmrr *a, uint32_t data = 0; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2), @@ -1679,9 +1701,13 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ }; \ TCGLabel *over = gen_new_label(); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ + data = \ + FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s);\ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -1805,7 +1831,7 @@ do_opivx_gvec_shift(DisasContext *s, arg_rmrr *a, GVecGen2sFn32 *gvec_fn, return false; } - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { TCGv_i32 src1 = tcg_temp_new_i32(); tcg_gen_trunc_tl_i32(src1, get_gpr(s, a->rs1, EXT_NONE)); @@ -1860,9 +1886,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ }; \ TCGLabel *over = gen_new_label(); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -2058,18 +2086,20 @@ static bool trans_vmv_v_v(DisasContext *s, arg_vmv_v_v *a) vext_check_isa_ill(s) && /* vmv.v.v has rs2 = 0 and vm = 1 */ vext_check_sss(s, a->rd, a->rs1, 0, 1)) { - if (s->vl_eq_vlmax) { + if (s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { tcg_gen_gvec_mov(s->sew, vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1), MAXSZ(s), MAXSZ(s)); } else { uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_gvec_2_ptr * const fns[4] = { gen_helper_vmv_v_v_b, gen_helper_vmv_v_v_h, gen_helper_vmv_v_v_w, gen_helper_vmv_v_v_d, }; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); tcg_gen_gvec_2_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1), cpu_env, s->cfg_ptr->vlen / 8, @@ -2093,17 +2123,27 @@ static bool trans_vmv_v_x(DisasContext *s, arg_vmv_v_x *a) TCGv s1; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); s1 = get_gpr(s, a->rs1, EXT_SIGN); - if (s->vl_eq_vlmax) { - tcg_gen_gvec_dup_tl(s->sew, vreg_ofs(s, a->rd), - MAXSZ(s), MAXSZ(s), s1); + if (s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { + if (get_xl(s) == MXL_RV32 && s->sew == MO_64) { + TCGv_i64 s1_i64 = tcg_temp_new_i64(); + tcg_gen_ext_tl_i64(s1_i64, s1); + tcg_gen_gvec_dup_i64(s->sew, vreg_ofs(s, a->rd), + MAXSZ(s), MAXSZ(s), s1_i64); + tcg_temp_free_i64(s1_i64); + } else { + tcg_gen_gvec_dup_tl(s->sew, vreg_ofs(s, a->rd), + MAXSZ(s), MAXSZ(s), s1); + } } else { TCGv_i32 desc; TCGv_i64 s1_i64 = tcg_temp_new_i64(); TCGv_ptr dest = tcg_temp_new_ptr(); uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_vmv_vx * const fns[4] = { gen_helper_vmv_v_x_b, gen_helper_vmv_v_x_h, gen_helper_vmv_v_x_w, gen_helper_vmv_v_x_d, @@ -2133,7 +2173,7 @@ static bool trans_vmv_v_i(DisasContext *s, arg_vmv_v_i *a) /* vmv.v.i has rs2 = 0 and vm = 1 */ vext_check_ss(s, a->rd, 0, 1)) { int64_t simm = sextract64(a->rs1, 0, 5); - if (s->vl_eq_vlmax) { + if (s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { tcg_gen_gvec_dup_imm(s->sew, vreg_ofs(s, a->rd), MAXSZ(s), MAXSZ(s), simm); mark_vs_dirty(s); @@ -2142,12 +2182,14 @@ static bool trans_vmv_v_i(DisasContext *s, arg_vmv_v_i *a) TCGv_i64 s1; TCGv_ptr dest; uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_vmv_vx * const fns[4] = { gen_helper_vmv_v_x_b, gen_helper_vmv_v_x_h, gen_helper_vmv_v_x_w, gen_helper_vmv_v_x_d, }; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); s1 = tcg_constant_i64(simm); dest = tcg_temp_new_ptr(); @@ -2300,9 +2342,13 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, RISCV_FRM_DYN); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ + data = \ + FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s);\ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -2330,6 +2376,7 @@ static bool opfvf_trans(uint32_t vd, uint32_t rs1, uint32_t vs2, TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); dest = tcg_temp_new_ptr(); mask = tcg_temp_new_ptr(); @@ -2384,6 +2431,9 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ gen_set_rm(s, RISCV_FRM_DYN); \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ + data = FIELD_DP32(data, VDATA, VTA_ALL_1S, \ + s->cfg_vta_all_1s); \ return opfvf_trans(a->rd, a->rs1, a->rs2, data, \ fns[s->sew - 1], s); \ } \ @@ -2418,9 +2468,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, RISCV_FRM_DYN); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over);\ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -2460,6 +2512,7 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ gen_set_rm(s, RISCV_FRM_DYN); \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ return opfvf_trans(a->rd, a->rs1, a->rs2, data, \ fns[s->sew - 1], s); \ } \ @@ -2492,9 +2545,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, RISCV_FRM_DYN); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -2534,6 +2589,7 @@ static bool trans_##NAME(DisasContext *s, arg_rmrr *a) \ gen_set_rm(s, RISCV_FRM_DYN); \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ return opfvf_trans(a->rd, a->rs1, a->rs2, data, \ fns[s->sew - 1], s); \ } \ @@ -2613,9 +2669,11 @@ static bool do_opfv(DisasContext *s, arg_rmr *a, TCGLabel *over = gen_new_label(); gen_set_rm(s, rm); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs2), cpu_env, s->cfg_ptr->vlen / 8, @@ -2707,7 +2765,7 @@ static bool trans_vfmv_v_f(DisasContext *s, arg_vfmv_v_f *a) TCGv_i64 t1; - if (s->vl_eq_vlmax) { + if (s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { t1 = tcg_temp_new_i64(); /* NaN-box f[rs1] */ do_nanbox(s, t1, cpu_fpr[a->rs1]); @@ -2719,6 +2777,7 @@ static bool trans_vfmv_v_f(DisasContext *s, arg_vfmv_v_f *a) TCGv_ptr dest; TCGv_i32 desc; uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_vmv_vx * const fns[3] = { gen_helper_vmv_v_x_h, gen_helper_vmv_v_x_w, @@ -2726,6 +2785,7 @@ static bool trans_vfmv_v_f(DisasContext *s, arg_vfmv_v_f *a) }; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); t1 = tcg_temp_new_i64(); /* NaN-box f[rs1] */ @@ -2814,9 +2874,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, FRM); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs2), cpu_env, \ s->cfg_ptr->vlen / 8, \ @@ -2865,8 +2927,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, RISCV_FRM_DYN); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ + data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs2), cpu_env, \ s->cfg_ptr->vlen / 8, \ @@ -2930,9 +2995,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, FRM); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs2), cpu_env, \ s->cfg_ptr->vlen / 8, \ @@ -2983,8 +3050,11 @@ static bool trans_##NAME(DisasContext *s, arg_rmr *a) \ TCGLabel *over = gen_new_label(); \ gen_set_rm(s, FRM); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ + data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = FIELD_DP32(data, VDATA, VTA, s->vta); \ tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs2), cpu_env, \ s->cfg_ptr->vlen / 8, \ @@ -3070,8 +3140,11 @@ static bool trans_##NAME(DisasContext *s, arg_r *a) \ gen_helper_gvec_4_ptr *fn = gen_helper_##NAME; \ TCGLabel *over = gen_new_label(); \ tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); \ + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); \ \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = \ + FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s);\ tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), \ vreg_ofs(s, a->rs1), \ vreg_ofs(s, a->rs2), cpu_env, \ @@ -3176,6 +3249,8 @@ static bool trans_##NAME(DisasContext *s, arg_rmr *a) \ \ data = FIELD_DP32(data, VDATA, VM, a->vm); \ data = FIELD_DP32(data, VDATA, LMUL, s->lmul); \ + data = \ + FIELD_DP32(data, VDATA, VTA_ALL_1S, s->cfg_vta_all_1s);\ tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), \ vreg_ofs(s, 0), vreg_ofs(s, a->rs2), \ cpu_env, s->cfg_ptr->vlen / 8, \ @@ -3213,6 +3288,7 @@ static bool trans_viota_m(DisasContext *s, arg_viota_m *a) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_gvec_3_ptr * const fns[4] = { gen_helper_viota_m_b, gen_helper_viota_m_h, gen_helper_viota_m_w, gen_helper_viota_m_d, @@ -3238,9 +3314,11 @@ static bool trans_vid_v(DisasContext *s, arg_vid_v *a) uint32_t data = 0; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); static gen_helper_gvec_2_ptr * const fns[4] = { gen_helper_vid_v_b, gen_helper_vid_v_h, gen_helper_vid_v_w, gen_helper_vid_v_d, @@ -3599,7 +3677,7 @@ static bool trans_vrgather_vx(DisasContext *s, arg_rmrr *a) return false; } - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { int scale = s->lmul - (s->sew + 3); int vlmax = s->cfg_ptr->vlen >> -scale; TCGv_i64 dest = tcg_temp_new_i64(); @@ -3631,7 +3709,7 @@ static bool trans_vrgather_vi(DisasContext *s, arg_rmrr *a) return false; } - if (a->vm && s->vl_eq_vlmax) { + if (a->vm && s->vl_eq_vlmax && !(s->vta && s->lmul < 0)) { int scale = s->lmul - (s->sew + 3); int vlmax = s->cfg_ptr->vlen >> -scale; if (a->rs1 >= vlmax) { @@ -3683,6 +3761,7 @@ static bool trans_vcompress_vm(DisasContext *s, arg_r *a) tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_4_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs1), vreg_ofs(s, a->rs2), cpu_env, s->cfg_ptr->vlen / 8, @@ -3748,6 +3827,7 @@ static bool int_ext_op(DisasContext *s, arg_rmr *a, uint8_t seq) gen_helper_gvec_3_ptr *fn; TCGLabel *over = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_vl, 0, over); + tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over); static gen_helper_gvec_3_ptr * const fns[6][4] = { { @@ -3782,6 +3862,8 @@ static bool int_ext_op(DisasContext *s, arg_rmr *a, uint8_t seq) } data = FIELD_DP32(data, VDATA, VM, a->vm); + data = FIELD_DP32(data, VDATA, LMUL, s->lmul); + data = FIELD_DP32(data, VDATA, VTA, s->vta); tcg_gen_gvec_3_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, 0), vreg_ofs(s, a->rs2), cpu_env, diff --git a/target/riscv/internals.h b/target/riscv/internals.h index dbb322bfa7..193ce57a6d 100644 --- a/target/riscv/internals.h +++ b/target/riscv/internals.h @@ -24,8 +24,10 @@ /* share data between vector helpers and decode code */ FIELD(VDATA, VM, 0, 1) FIELD(VDATA, LMUL, 1, 3) -FIELD(VDATA, NF, 4, 4) -FIELD(VDATA, WD, 4, 1) +FIELD(VDATA, VTA, 4, 1) +FIELD(VDATA, VTA_ALL_1S, 5, 1) +FIELD(VDATA, NF, 6, 4) +FIELD(VDATA, WD, 6, 1) /* float point classify helpers */ target_ulong fclass_h(uint64_t frs1); diff --git a/target/riscv/translate.c b/target/riscv/translate.c index 55a4713af2..b151c20674 100644 --- a/target/riscv/translate.c +++ b/target/riscv/translate.c @@ -94,6 +94,8 @@ typedef struct DisasContext { */ int8_t lmul; uint8_t sew; + uint8_t vta; + bool cfg_vta_all_1s; target_ulong vstart; bool vl_eq_vlmax; uint8_t ntemp; @@ -1099,6 +1101,8 @@ static void riscv_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) ctx->vill = FIELD_EX32(tb_flags, TB_FLAGS, VILL); ctx->sew = FIELD_EX32(tb_flags, TB_FLAGS, SEW); ctx->lmul = sextract32(FIELD_EX32(tb_flags, TB_FLAGS, LMUL), 0, 3); + ctx->vta = FIELD_EX32(tb_flags, TB_FLAGS, VTA) && cpu->cfg.rvv_ta_all_1s; + ctx->cfg_vta_all_1s = cpu->cfg.rvv_ta_all_1s; ctx->vstart = env->vstart; ctx->vl_eq_vlmax = FIELD_EX32(tb_flags, TB_FLAGS, VL_EQ_VLMAX); ctx->misa_mxl_max = env->misa_mxl_max; diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 576b14e5a3..a96fc49c71 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -122,12 +122,22 @@ static inline int32_t vext_lmul(uint32_t desc) return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3); } +static inline uint32_t vext_vta(uint32_t desc) +{ + return FIELD_EX32(simd_data(desc), VDATA, VTA); +} + +static inline uint32_t vext_vta_all_1s(uint32_t desc) +{ + return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S); +} + /* * Get the maximum number of elements can be operated. * - * esz: log2 of element size in bytes. + * log2_esz: log2 of element size in bytes. */ -static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz) +static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz) { /* * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits. @@ -136,10 +146,25 @@ static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz) uint32_t vlenb = simd_maxsz(desc); /* Return VLMAX */ - int scale = vext_lmul(desc) - esz; + int scale = vext_lmul(desc) - log2_esz; return scale < 0 ? vlenb >> -scale : vlenb << scale; } +/* + * Get number of total elements, including prestart, body and tail elements. + * Note that when LMUL < 1, the tail includes the elements past VLMAX that + * are held in the same vector register. + */ +static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc, + uint32_t esz) +{ + uint32_t vlenb = simd_maxsz(desc); + uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW); + int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 : + ctzl(esz) - ctzl(sew) + vext_lmul(desc); + return (vlenb << emul) / esz; +} + static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr) { return (addr & env->cur_pmmask) | env->cur_pmbase; @@ -172,6 +197,20 @@ static void probe_pages(CPURISCVState *env, target_ulong addr, } } +/* set agnostic elements to 1s */ +static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt, + uint32_t tot) +{ + if (is_agnostic == 0) { + /* policy undisturbed */ + return; + } + if (tot - cnt == 0) { + return ; + } + memset(base + cnt, -1, tot - cnt); +} + static inline void vext_set_elem_mask(void *v0, int index, uint8_t value) { @@ -231,11 +270,14 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride, CPURISCVState *env, uint32_t desc, uint32_t vm, vext_ldst_elem_fn *ldst_elem, - uint32_t esz, uintptr_t ra, MMUAccessType access_type) + uint32_t log2_esz, uintptr_t ra) { uint32_t i, k; uint32_t nf = vext_nf(desc); - uint32_t max_elems = vext_max_elems(desc, esz); + uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); for (i = env->vstart; i < env->vl; i++, env->vstart++) { if (!vm && !vext_elem_mask(v0, i)) { @@ -244,12 +286,24 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, k = 0; while (k < nf) { - target_ulong addr = base + stride * i + (k << esz); + target_ulong addr = base + stride * i + (k << log2_esz); ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); k++; } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz, + (k * max_elems + max_elems) * esz); + } + if (nf * max_elems % total_elems != 0) { + uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3; + uint32_t registers_used = + ((nf * max_elems) * esz + (vlenb - 1)) / vlenb; + vext_set_elems_1s(vd, vta, (nf * max_elems) * esz, + registers_used * vlenb); + } } #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \ @@ -259,7 +313,7 @@ void HELPER(NAME)(void *vd, void * v0, target_ulong base, \ { \ uint32_t vm = vext_vm(desc); \ vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_LOAD); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b) @@ -274,7 +328,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ { \ uint32_t vm = vext_vm(desc); \ vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_STORE); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } GEN_VEXT_ST_STRIDE(vsse8_v, int8_t, ste_b) @@ -289,23 +343,38 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d) /* unmasked unit-stride load and store operation*/ static void vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn *ldst_elem, uint32_t esz, uint32_t evl, - uintptr_t ra, MMUAccessType access_type) + vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl, + uintptr_t ra) { uint32_t i, k; uint32_t nf = vext_nf(desc); - uint32_t max_elems = vext_max_elems(desc, esz); + uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); /* load bytes from guest memory */ for (i = env->vstart; i < evl; i++, env->vstart++) { k = 0; while (k < nf) { - target_ulong addr = base + ((i * nf + k) << esz); + target_ulong addr = base + ((i * nf + k) << log2_esz); ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); k++; } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz, + (k * max_elems + max_elems) * esz); + } + if (nf * max_elems % total_elems != 0) { + uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3; + uint32_t registers_used = + ((nf * max_elems) * esz + (vlenb - 1)) / vlenb; + vext_set_elems_1s(vd, vta, (nf * max_elems) * esz, + registers_used * vlenb); + } } /* @@ -319,14 +388,14 @@ void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \ { \ uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \ vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_LOAD); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } \ \ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ vext_ldst_us(vd, base, env, desc, LOAD_FN, \ - ctzl(sizeof(ETYPE)), env->vl, GETPC(), MMU_DATA_LOAD); \ + ctzl(sizeof(ETYPE)), env->vl, GETPC()); \ } GEN_VEXT_LD_US(vle8_v, int8_t, lde_b) @@ -340,14 +409,14 @@ void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \ { \ uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \ vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_STORE); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } \ \ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ vext_ldst_us(vd, base, env, desc, STORE_FN, \ - ctzl(sizeof(ETYPE)), env->vl, GETPC(), MMU_DATA_STORE); \ + ctzl(sizeof(ETYPE)), env->vl, GETPC()); \ } GEN_VEXT_ST_US(vse8_v, int8_t, ste_b) @@ -364,7 +433,7 @@ void HELPER(vlm_v)(void *vd, void *v0, target_ulong base, /* evl = ceil(vl/8) */ uint8_t evl = (env->vl + 7) >> 3; vext_ldst_us(vd, base, env, desc, lde_b, - 0, evl, GETPC(), MMU_DATA_LOAD); + 0, evl, GETPC()); } void HELPER(vsm_v)(void *vd, void *v0, target_ulong base, @@ -373,7 +442,7 @@ void HELPER(vsm_v)(void *vd, void *v0, target_ulong base, /* evl = ceil(vl/8) */ uint8_t evl = (env->vl + 7) >> 3; vext_ldst_us(vd, base, env, desc, ste_b, - 0, evl, GETPC(), MMU_DATA_STORE); + 0, evl, GETPC()); } /* @@ -399,12 +468,15 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, void *vs2, CPURISCVState *env, uint32_t desc, vext_get_index_addr get_index_addr, vext_ldst_elem_fn *ldst_elem, - uint32_t esz, uintptr_t ra, MMUAccessType access_type) + uint32_t log2_esz, uintptr_t ra) { uint32_t i, k; uint32_t nf = vext_nf(desc); uint32_t vm = vext_vm(desc); - uint32_t max_elems = vext_max_elems(desc, esz); + uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); /* load bytes from guest memory */ for (i = env->vstart; i < env->vl; i++, env->vstart++) { @@ -414,12 +486,24 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, k = 0; while (k < nf) { - abi_ptr addr = get_index_addr(base, i, vs2) + (k << esz); + abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz); ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); k++; } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz, + (k * max_elems + max_elems) * esz); + } + if (nf * max_elems % total_elems != 0) { + uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3; + uint32_t registers_used = + ((nf * max_elems) * esz + (vlenb - 1)) / vlenb; + vext_set_elems_1s(vd, vta, (nf * max_elems) * esz, + registers_used * vlenb); + } } #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN) \ @@ -427,7 +511,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ void *vs2, CPURISCVState *env, uint32_t desc) \ { \ vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \ - LOAD_FN, ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_LOAD); \ + LOAD_FN, ctzl(sizeof(ETYPE)), GETPC()); \ } GEN_VEXT_LD_INDEX(vlxei8_8_v, int8_t, idx_b, lde_b) @@ -453,7 +537,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ { \ vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \ STORE_FN, ctzl(sizeof(ETYPE)), \ - GETPC(), MMU_DATA_STORE); \ + GETPC()); \ } GEN_VEXT_ST_INDEX(vsxei8_8_v, int8_t, idx_b, ste_b) @@ -480,13 +564,16 @@ static inline void vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env, uint32_t desc, vext_ldst_elem_fn *ldst_elem, - uint32_t esz, uintptr_t ra) + uint32_t log2_esz, uintptr_t ra) { void *host; uint32_t i, k, vl = 0; uint32_t nf = vext_nf(desc); uint32_t vm = vext_vm(desc); - uint32_t max_elems = vext_max_elems(desc, esz); + uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); target_ulong addr, offset, remain; /* probe every access*/ @@ -494,12 +581,12 @@ vext_ldff(void *vd, void *v0, target_ulong base, if (!vm && !vext_elem_mask(v0, i)) { continue; } - addr = adjust_addr(env, base + i * (nf << esz)); + addr = adjust_addr(env, base + i * (nf << log2_esz)); if (i == 0) { - probe_pages(env, addr, nf << esz, ra, MMU_DATA_LOAD); + probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD); } else { /* if it triggers an exception, no need to check watchpoint */ - remain = nf << esz; + remain = nf << log2_esz; while (remain > 0) { offset = -(addr | TARGET_PAGE_MASK); host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, @@ -536,12 +623,24 @@ ProbeSuccess: continue; } while (k < nf) { - target_ulong addr = base + ((i * nf + k) << esz); + target_ulong addr = base + ((i * nf + k) << log2_esz); ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); k++; } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz, + (k * max_elems + max_elems) * esz); + } + if (nf * max_elems % total_elems != 0) { + uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3; + uint32_t registers_used = + ((nf * max_elems) * esz + (vlenb - 1)) / vlenb; + vext_set_elems_1s(vd, vta, (nf * max_elems) * esz, + registers_used * vlenb); + } } #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \ @@ -576,13 +675,12 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d) */ static void vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn *ldst_elem, uint32_t esz, uintptr_t ra, - MMUAccessType access_type) + vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra) { uint32_t i, k, off, pos; uint32_t nf = vext_nf(desc); uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3; - uint32_t max_elems = vlenb >> esz; + uint32_t max_elems = vlenb >> log2_esz; k = env->vstart / max_elems; off = env->vstart % max_elems; @@ -590,7 +688,7 @@ vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, if (off) { /* load/store rest of elements of current segment pointed by vstart */ for (pos = off; pos < max_elems; pos++, env->vstart++) { - target_ulong addr = base + ((pos + k * max_elems) << esz); + target_ulong addr = base + ((pos + k * max_elems) << log2_esz); ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra); } k++; @@ -599,7 +697,7 @@ vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, /* load/store elements for rest of segments */ for (; k < nf; k++) { for (i = 0; i < max_elems; i++, env->vstart++) { - target_ulong addr = base + ((i + k * max_elems) << esz); + target_ulong addr = base + ((i + k * max_elems) << log2_esz); ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); } } @@ -612,8 +710,7 @@ void HELPER(NAME)(void *vd, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ vext_ldst_whole(vd, base, env, desc, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), \ - MMU_DATA_LOAD); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b) @@ -638,8 +735,7 @@ void HELPER(NAME)(void *vd, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ vext_ldst_whole(vd, base, env, desc, STORE_FN, \ - ctzl(sizeof(ETYPE)), GETPC(), \ - MMU_DATA_STORE); \ + ctzl(sizeof(ETYPE)), GETPC()); \ } GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b) @@ -710,11 +806,12 @@ RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB) static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2, CPURISCVState *env, uint32_t desc, - uint32_t esz, uint32_t dsz, - opivv2_fn *fn) + opivv2_fn *fn, uint32_t esz) { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); uint32_t i; for (i = env->vstart; i < vl; i++) { @@ -724,26 +821,28 @@ static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2, fn(vd, vs1, vs2, i); } env->vstart = 0; + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); } /* generate the helpers for OPIVV */ -#define GEN_VEXT_VV(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VV(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ void *vs2, CPURISCVState *env, \ uint32_t desc) \ { \ - do_vext_vv(vd, v0, vs1, vs2, env, desc, ESZ, DSZ, \ - do_##NAME); \ + do_vext_vv(vd, v0, vs1, vs2, env, desc, \ + do_##NAME, ESZ); \ } -GEN_VEXT_VV(vadd_vv_b, 1, 1) -GEN_VEXT_VV(vadd_vv_h, 2, 2) -GEN_VEXT_VV(vadd_vv_w, 4, 4) -GEN_VEXT_VV(vadd_vv_d, 8, 8) -GEN_VEXT_VV(vsub_vv_b, 1, 1) -GEN_VEXT_VV(vsub_vv_h, 2, 2) -GEN_VEXT_VV(vsub_vv_w, 4, 4) -GEN_VEXT_VV(vsub_vv_d, 8, 8) +GEN_VEXT_VV(vadd_vv_b, 1) +GEN_VEXT_VV(vadd_vv_h, 2) +GEN_VEXT_VV(vadd_vv_w, 4) +GEN_VEXT_VV(vadd_vv_d, 8) +GEN_VEXT_VV(vsub_vv_b, 1) +GEN_VEXT_VV(vsub_vv_h, 2) +GEN_VEXT_VV(vsub_vv_w, 4) +GEN_VEXT_VV(vsub_vv_d, 8) typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i); @@ -773,11 +872,12 @@ RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB) static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2, CPURISCVState *env, uint32_t desc, - uint32_t esz, uint32_t dsz, - opivx2_fn fn) + opivx2_fn fn, uint32_t esz) { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); uint32_t i; for (i = env->vstart; i < vl; i++) { @@ -787,30 +887,32 @@ static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2, fn(vd, s1, vs2, i); } env->vstart = 0; + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); } /* generate the helpers for OPIVX */ -#define GEN_VEXT_VX(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VX(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ void *vs2, CPURISCVState *env, \ uint32_t desc) \ { \ - do_vext_vx(vd, v0, s1, vs2, env, desc, ESZ, DSZ, \ - do_##NAME); \ + do_vext_vx(vd, v0, s1, vs2, env, desc, \ + do_##NAME, ESZ); \ } -GEN_VEXT_VX(vadd_vx_b, 1, 1) -GEN_VEXT_VX(vadd_vx_h, 2, 2) -GEN_VEXT_VX(vadd_vx_w, 4, 4) -GEN_VEXT_VX(vadd_vx_d, 8, 8) -GEN_VEXT_VX(vsub_vx_b, 1, 1) -GEN_VEXT_VX(vsub_vx_h, 2, 2) -GEN_VEXT_VX(vsub_vx_w, 4, 4) -GEN_VEXT_VX(vsub_vx_d, 8, 8) -GEN_VEXT_VX(vrsub_vx_b, 1, 1) -GEN_VEXT_VX(vrsub_vx_h, 2, 2) -GEN_VEXT_VX(vrsub_vx_w, 4, 4) -GEN_VEXT_VX(vrsub_vx_d, 8, 8) +GEN_VEXT_VX(vadd_vx_b, 1) +GEN_VEXT_VX(vadd_vx_h, 2) +GEN_VEXT_VX(vadd_vx_w, 4) +GEN_VEXT_VX(vadd_vx_d, 8) +GEN_VEXT_VX(vsub_vx_b, 1) +GEN_VEXT_VX(vsub_vx_h, 2) +GEN_VEXT_VX(vsub_vx_w, 4) +GEN_VEXT_VX(vsub_vx_d, 8) +GEN_VEXT_VX(vrsub_vx_b, 1) +GEN_VEXT_VX(vrsub_vx_h, 2) +GEN_VEXT_VX(vrsub_vx_w, 4) +GEN_VEXT_VX(vrsub_vx_d, 8) void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc) { @@ -889,30 +991,30 @@ RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD) RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB) RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB) RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB) -GEN_VEXT_VV(vwaddu_vv_b, 1, 2) -GEN_VEXT_VV(vwaddu_vv_h, 2, 4) -GEN_VEXT_VV(vwaddu_vv_w, 4, 8) -GEN_VEXT_VV(vwsubu_vv_b, 1, 2) -GEN_VEXT_VV(vwsubu_vv_h, 2, 4) -GEN_VEXT_VV(vwsubu_vv_w, 4, 8) -GEN_VEXT_VV(vwadd_vv_b, 1, 2) -GEN_VEXT_VV(vwadd_vv_h, 2, 4) -GEN_VEXT_VV(vwadd_vv_w, 4, 8) -GEN_VEXT_VV(vwsub_vv_b, 1, 2) -GEN_VEXT_VV(vwsub_vv_h, 2, 4) -GEN_VEXT_VV(vwsub_vv_w, 4, 8) -GEN_VEXT_VV(vwaddu_wv_b, 1, 2) -GEN_VEXT_VV(vwaddu_wv_h, 2, 4) -GEN_VEXT_VV(vwaddu_wv_w, 4, 8) -GEN_VEXT_VV(vwsubu_wv_b, 1, 2) -GEN_VEXT_VV(vwsubu_wv_h, 2, 4) -GEN_VEXT_VV(vwsubu_wv_w, 4, 8) -GEN_VEXT_VV(vwadd_wv_b, 1, 2) -GEN_VEXT_VV(vwadd_wv_h, 2, 4) -GEN_VEXT_VV(vwadd_wv_w, 4, 8) -GEN_VEXT_VV(vwsub_wv_b, 1, 2) -GEN_VEXT_VV(vwsub_wv_h, 2, 4) -GEN_VEXT_VV(vwsub_wv_w, 4, 8) +GEN_VEXT_VV(vwaddu_vv_b, 2) +GEN_VEXT_VV(vwaddu_vv_h, 4) +GEN_VEXT_VV(vwaddu_vv_w, 8) +GEN_VEXT_VV(vwsubu_vv_b, 2) +GEN_VEXT_VV(vwsubu_vv_h, 4) +GEN_VEXT_VV(vwsubu_vv_w, 8) +GEN_VEXT_VV(vwadd_vv_b, 2) +GEN_VEXT_VV(vwadd_vv_h, 4) +GEN_VEXT_VV(vwadd_vv_w, 8) +GEN_VEXT_VV(vwsub_vv_b, 2) +GEN_VEXT_VV(vwsub_vv_h, 4) +GEN_VEXT_VV(vwsub_vv_w, 8) +GEN_VEXT_VV(vwaddu_wv_b, 2) +GEN_VEXT_VV(vwaddu_wv_h, 4) +GEN_VEXT_VV(vwaddu_wv_w, 8) +GEN_VEXT_VV(vwsubu_wv_b, 2) +GEN_VEXT_VV(vwsubu_wv_h, 4) +GEN_VEXT_VV(vwsubu_wv_w, 8) +GEN_VEXT_VV(vwadd_wv_b, 2) +GEN_VEXT_VV(vwadd_wv_h, 4) +GEN_VEXT_VV(vwadd_wv_w, 8) +GEN_VEXT_VV(vwsub_wv_b, 2) +GEN_VEXT_VV(vwsub_wv_h, 4) +GEN_VEXT_VV(vwsub_wv_w, 8) RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD) RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD) @@ -938,30 +1040,30 @@ RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD) RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB) RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB) RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB) -GEN_VEXT_VX(vwaddu_vx_b, 1, 2) -GEN_VEXT_VX(vwaddu_vx_h, 2, 4) -GEN_VEXT_VX(vwaddu_vx_w, 4, 8) -GEN_VEXT_VX(vwsubu_vx_b, 1, 2) -GEN_VEXT_VX(vwsubu_vx_h, 2, 4) -GEN_VEXT_VX(vwsubu_vx_w, 4, 8) -GEN_VEXT_VX(vwadd_vx_b, 1, 2) -GEN_VEXT_VX(vwadd_vx_h, 2, 4) -GEN_VEXT_VX(vwadd_vx_w, 4, 8) -GEN_VEXT_VX(vwsub_vx_b, 1, 2) -GEN_VEXT_VX(vwsub_vx_h, 2, 4) -GEN_VEXT_VX(vwsub_vx_w, 4, 8) -GEN_VEXT_VX(vwaddu_wx_b, 1, 2) -GEN_VEXT_VX(vwaddu_wx_h, 2, 4) -GEN_VEXT_VX(vwaddu_wx_w, 4, 8) -GEN_VEXT_VX(vwsubu_wx_b, 1, 2) -GEN_VEXT_VX(vwsubu_wx_h, 2, 4) -GEN_VEXT_VX(vwsubu_wx_w, 4, 8) -GEN_VEXT_VX(vwadd_wx_b, 1, 2) -GEN_VEXT_VX(vwadd_wx_h, 2, 4) -GEN_VEXT_VX(vwadd_wx_w, 4, 8) -GEN_VEXT_VX(vwsub_wx_b, 1, 2) -GEN_VEXT_VX(vwsub_wx_h, 2, 4) -GEN_VEXT_VX(vwsub_wx_w, 4, 8) +GEN_VEXT_VX(vwaddu_vx_b, 2) +GEN_VEXT_VX(vwaddu_vx_h, 4) +GEN_VEXT_VX(vwaddu_vx_w, 8) +GEN_VEXT_VX(vwsubu_vx_b, 2) +GEN_VEXT_VX(vwsubu_vx_h, 4) +GEN_VEXT_VX(vwsubu_vx_w, 8) +GEN_VEXT_VX(vwadd_vx_b, 2) +GEN_VEXT_VX(vwadd_vx_h, 4) +GEN_VEXT_VX(vwadd_vx_w, 8) +GEN_VEXT_VX(vwsub_vx_b, 2) +GEN_VEXT_VX(vwsub_vx_h, 4) +GEN_VEXT_VX(vwsub_vx_w, 8) +GEN_VEXT_VX(vwaddu_wx_b, 2) +GEN_VEXT_VX(vwaddu_wx_h, 4) +GEN_VEXT_VX(vwaddu_wx_w, 8) +GEN_VEXT_VX(vwsubu_wx_b, 2) +GEN_VEXT_VX(vwsubu_wx_h, 4) +GEN_VEXT_VX(vwsubu_wx_w, 8) +GEN_VEXT_VX(vwadd_wx_b, 2) +GEN_VEXT_VX(vwadd_wx_h, 4) +GEN_VEXT_VX(vwadd_wx_w, 8) +GEN_VEXT_VX(vwsub_wx_b, 2) +GEN_VEXT_VX(vwsub_wx_h, 4) +GEN_VEXT_VX(vwsub_wx_w, 8) /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */ #define DO_VADC(N, M, C) (N + M + C) @@ -972,6 +1074,10 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -982,6 +1088,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t, H1, DO_VADC) @@ -999,6 +1107,9 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1008,6 +1119,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t, H1, DO_VADC) @@ -1030,6 +1143,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ { \ uint32_t vl = env->vl; \ uint32_t vm = vext_vm(desc); \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1039,6 +1154,13 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t, H1, DO_MADC) @@ -1057,6 +1179,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ { \ uint32_t vl = env->vl; \ uint32_t vm = vext_vm(desc); \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1066,6 +1190,13 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ DO_OP(s2, (ETYPE)(target_long)s1, carry)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t, H1, DO_MADC) @@ -1091,18 +1222,18 @@ RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR) RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR) RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR) RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR) -GEN_VEXT_VV(vand_vv_b, 1, 1) -GEN_VEXT_VV(vand_vv_h, 2, 2) -GEN_VEXT_VV(vand_vv_w, 4, 4) -GEN_VEXT_VV(vand_vv_d, 8, 8) -GEN_VEXT_VV(vor_vv_b, 1, 1) -GEN_VEXT_VV(vor_vv_h, 2, 2) -GEN_VEXT_VV(vor_vv_w, 4, 4) -GEN_VEXT_VV(vor_vv_d, 8, 8) -GEN_VEXT_VV(vxor_vv_b, 1, 1) -GEN_VEXT_VV(vxor_vv_h, 2, 2) -GEN_VEXT_VV(vxor_vv_w, 4, 4) -GEN_VEXT_VV(vxor_vv_d, 8, 8) +GEN_VEXT_VV(vand_vv_b, 1) +GEN_VEXT_VV(vand_vv_h, 2) +GEN_VEXT_VV(vand_vv_w, 4) +GEN_VEXT_VV(vand_vv_d, 8) +GEN_VEXT_VV(vor_vv_b, 1) +GEN_VEXT_VV(vor_vv_h, 2) +GEN_VEXT_VV(vor_vv_w, 4) +GEN_VEXT_VV(vor_vv_d, 8) +GEN_VEXT_VV(vxor_vv_b, 1) +GEN_VEXT_VV(vxor_vv_h, 2) +GEN_VEXT_VV(vxor_vv_w, 4) +GEN_VEXT_VV(vxor_vv_d, 8) RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND) RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND) @@ -1116,18 +1247,18 @@ RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR) RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR) RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR) RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR) -GEN_VEXT_VX(vand_vx_b, 1, 1) -GEN_VEXT_VX(vand_vx_h, 2, 2) -GEN_VEXT_VX(vand_vx_w, 4, 4) -GEN_VEXT_VX(vand_vx_d, 8, 8) -GEN_VEXT_VX(vor_vx_b, 1, 1) -GEN_VEXT_VX(vor_vx_h, 2, 2) -GEN_VEXT_VX(vor_vx_w, 4, 4) -GEN_VEXT_VX(vor_vx_d, 8, 8) -GEN_VEXT_VX(vxor_vx_b, 1, 1) -GEN_VEXT_VX(vxor_vx_h, 2, 2) -GEN_VEXT_VX(vxor_vx_w, 4, 4) -GEN_VEXT_VX(vxor_vx_d, 8, 8) +GEN_VEXT_VX(vand_vx_b, 1) +GEN_VEXT_VX(vand_vx_h, 2) +GEN_VEXT_VX(vand_vx_w, 4) +GEN_VEXT_VX(vand_vx_d, 8) +GEN_VEXT_VX(vor_vx_b, 1) +GEN_VEXT_VX(vor_vx_h, 2) +GEN_VEXT_VX(vor_vx_w, 4) +GEN_VEXT_VX(vor_vx_d, 8) +GEN_VEXT_VX(vxor_vx_b, 1) +GEN_VEXT_VX(vxor_vx_h, 2) +GEN_VEXT_VX(vxor_vx_w, 4) +GEN_VEXT_VX(vxor_vx_d, 8) /* Vector Single-Width Bit Shift Instructions */ #define DO_SLL(N, M) (N << (M)) @@ -1140,6 +1271,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(TS1); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1151,6 +1285,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t, uint8_t, H1, H1, DO_SLL, 0x7) @@ -1175,6 +1311,10 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(TD); \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1185,6 +1325,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ *((TD *)vd + HD(i)) = OP(s2, s1 & MASK); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\ } GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7) @@ -1229,6 +1371,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1240,6 +1384,13 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ vext_set_elem_mask(vd, i, DO_OP(s2, s1)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t, H1, DO_MSEQ) @@ -1278,6 +1429,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1289,6 +1442,13 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ DO_OP(s2, (ETYPE)(target_long)s1)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t, H1, DO_MSEQ) @@ -1348,22 +1508,22 @@ RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX) RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX) RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX) RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX) -GEN_VEXT_VV(vminu_vv_b, 1, 1) -GEN_VEXT_VV(vminu_vv_h, 2, 2) -GEN_VEXT_VV(vminu_vv_w, 4, 4) -GEN_VEXT_VV(vminu_vv_d, 8, 8) -GEN_VEXT_VV(vmin_vv_b, 1, 1) -GEN_VEXT_VV(vmin_vv_h, 2, 2) -GEN_VEXT_VV(vmin_vv_w, 4, 4) -GEN_VEXT_VV(vmin_vv_d, 8, 8) -GEN_VEXT_VV(vmaxu_vv_b, 1, 1) -GEN_VEXT_VV(vmaxu_vv_h, 2, 2) -GEN_VEXT_VV(vmaxu_vv_w, 4, 4) -GEN_VEXT_VV(vmaxu_vv_d, 8, 8) -GEN_VEXT_VV(vmax_vv_b, 1, 1) -GEN_VEXT_VV(vmax_vv_h, 2, 2) -GEN_VEXT_VV(vmax_vv_w, 4, 4) -GEN_VEXT_VV(vmax_vv_d, 8, 8) +GEN_VEXT_VV(vminu_vv_b, 1) +GEN_VEXT_VV(vminu_vv_h, 2) +GEN_VEXT_VV(vminu_vv_w, 4) +GEN_VEXT_VV(vminu_vv_d, 8) +GEN_VEXT_VV(vmin_vv_b, 1) +GEN_VEXT_VV(vmin_vv_h, 2) +GEN_VEXT_VV(vmin_vv_w, 4) +GEN_VEXT_VV(vmin_vv_d, 8) +GEN_VEXT_VV(vmaxu_vv_b, 1) +GEN_VEXT_VV(vmaxu_vv_h, 2) +GEN_VEXT_VV(vmaxu_vv_w, 4) +GEN_VEXT_VV(vmaxu_vv_d, 8) +GEN_VEXT_VV(vmax_vv_b, 1) +GEN_VEXT_VV(vmax_vv_h, 2) +GEN_VEXT_VV(vmax_vv_w, 4) +GEN_VEXT_VV(vmax_vv_d, 8) RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN) RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN) @@ -1381,22 +1541,22 @@ RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX) RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX) RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX) RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX) -GEN_VEXT_VX(vminu_vx_b, 1, 1) -GEN_VEXT_VX(vminu_vx_h, 2, 2) -GEN_VEXT_VX(vminu_vx_w, 4, 4) -GEN_VEXT_VX(vminu_vx_d, 8, 8) -GEN_VEXT_VX(vmin_vx_b, 1, 1) -GEN_VEXT_VX(vmin_vx_h, 2, 2) -GEN_VEXT_VX(vmin_vx_w, 4, 4) -GEN_VEXT_VX(vmin_vx_d, 8, 8) -GEN_VEXT_VX(vmaxu_vx_b, 1, 1) -GEN_VEXT_VX(vmaxu_vx_h, 2, 2) -GEN_VEXT_VX(vmaxu_vx_w, 4, 4) -GEN_VEXT_VX(vmaxu_vx_d, 8, 8) -GEN_VEXT_VX(vmax_vx_b, 1, 1) -GEN_VEXT_VX(vmax_vx_h, 2, 2) -GEN_VEXT_VX(vmax_vx_w, 4, 4) -GEN_VEXT_VX(vmax_vx_d, 8, 8) +GEN_VEXT_VX(vminu_vx_b, 1) +GEN_VEXT_VX(vminu_vx_h, 2) +GEN_VEXT_VX(vminu_vx_w, 4) +GEN_VEXT_VX(vminu_vx_d, 8) +GEN_VEXT_VX(vmin_vx_b, 1) +GEN_VEXT_VX(vmin_vx_h, 2) +GEN_VEXT_VX(vmin_vx_w, 4) +GEN_VEXT_VX(vmin_vx_d, 8) +GEN_VEXT_VX(vmaxu_vx_b, 1) +GEN_VEXT_VX(vmaxu_vx_h, 2) +GEN_VEXT_VX(vmaxu_vx_w, 4) +GEN_VEXT_VX(vmaxu_vx_d, 8) +GEN_VEXT_VX(vmax_vx_b, 1) +GEN_VEXT_VX(vmax_vx_h, 2) +GEN_VEXT_VX(vmax_vx_w, 4) +GEN_VEXT_VX(vmax_vx_d, 8) /* Vector Single-Width Integer Multiply Instructions */ #define DO_MUL(N, M) (N * M) @@ -1404,10 +1564,10 @@ RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL) RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL) RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL) RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL) -GEN_VEXT_VV(vmul_vv_b, 1, 1) -GEN_VEXT_VV(vmul_vv_h, 2, 2) -GEN_VEXT_VV(vmul_vv_w, 4, 4) -GEN_VEXT_VV(vmul_vv_d, 8, 8) +GEN_VEXT_VV(vmul_vv_b, 1) +GEN_VEXT_VV(vmul_vv_h, 2) +GEN_VEXT_VV(vmul_vv_w, 4) +GEN_VEXT_VV(vmul_vv_d, 8) static int8_t do_mulh_b(int8_t s2, int8_t s1) { @@ -1511,18 +1671,18 @@ RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b) RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h) RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w) RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d) -GEN_VEXT_VV(vmulh_vv_b, 1, 1) -GEN_VEXT_VV(vmulh_vv_h, 2, 2) -GEN_VEXT_VV(vmulh_vv_w, 4, 4) -GEN_VEXT_VV(vmulh_vv_d, 8, 8) -GEN_VEXT_VV(vmulhu_vv_b, 1, 1) -GEN_VEXT_VV(vmulhu_vv_h, 2, 2) -GEN_VEXT_VV(vmulhu_vv_w, 4, 4) -GEN_VEXT_VV(vmulhu_vv_d, 8, 8) -GEN_VEXT_VV(vmulhsu_vv_b, 1, 1) -GEN_VEXT_VV(vmulhsu_vv_h, 2, 2) -GEN_VEXT_VV(vmulhsu_vv_w, 4, 4) -GEN_VEXT_VV(vmulhsu_vv_d, 8, 8) +GEN_VEXT_VV(vmulh_vv_b, 1) +GEN_VEXT_VV(vmulh_vv_h, 2) +GEN_VEXT_VV(vmulh_vv_w, 4) +GEN_VEXT_VV(vmulh_vv_d, 8) +GEN_VEXT_VV(vmulhu_vv_b, 1) +GEN_VEXT_VV(vmulhu_vv_h, 2) +GEN_VEXT_VV(vmulhu_vv_w, 4) +GEN_VEXT_VV(vmulhu_vv_d, 8) +GEN_VEXT_VV(vmulhsu_vv_b, 1) +GEN_VEXT_VV(vmulhsu_vv_h, 2) +GEN_VEXT_VV(vmulhsu_vv_w, 4) +GEN_VEXT_VV(vmulhsu_vv_d, 8) RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL) RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL) @@ -1540,22 +1700,22 @@ RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b) RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h) RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w) RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d) -GEN_VEXT_VX(vmul_vx_b, 1, 1) -GEN_VEXT_VX(vmul_vx_h, 2, 2) -GEN_VEXT_VX(vmul_vx_w, 4, 4) -GEN_VEXT_VX(vmul_vx_d, 8, 8) -GEN_VEXT_VX(vmulh_vx_b, 1, 1) -GEN_VEXT_VX(vmulh_vx_h, 2, 2) -GEN_VEXT_VX(vmulh_vx_w, 4, 4) -GEN_VEXT_VX(vmulh_vx_d, 8, 8) -GEN_VEXT_VX(vmulhu_vx_b, 1, 1) -GEN_VEXT_VX(vmulhu_vx_h, 2, 2) -GEN_VEXT_VX(vmulhu_vx_w, 4, 4) -GEN_VEXT_VX(vmulhu_vx_d, 8, 8) -GEN_VEXT_VX(vmulhsu_vx_b, 1, 1) -GEN_VEXT_VX(vmulhsu_vx_h, 2, 2) -GEN_VEXT_VX(vmulhsu_vx_w, 4, 4) -GEN_VEXT_VX(vmulhsu_vx_d, 8, 8) +GEN_VEXT_VX(vmul_vx_b, 1) +GEN_VEXT_VX(vmul_vx_h, 2) +GEN_VEXT_VX(vmul_vx_w, 4) +GEN_VEXT_VX(vmul_vx_d, 8) +GEN_VEXT_VX(vmulh_vx_b, 1) +GEN_VEXT_VX(vmulh_vx_h, 2) +GEN_VEXT_VX(vmulh_vx_w, 4) +GEN_VEXT_VX(vmulh_vx_d, 8) +GEN_VEXT_VX(vmulhu_vx_b, 1) +GEN_VEXT_VX(vmulhu_vx_h, 2) +GEN_VEXT_VX(vmulhu_vx_w, 4) +GEN_VEXT_VX(vmulhu_vx_d, 8) +GEN_VEXT_VX(vmulhsu_vx_b, 1) +GEN_VEXT_VX(vmulhsu_vx_h, 2) +GEN_VEXT_VX(vmulhsu_vx_w, 4) +GEN_VEXT_VX(vmulhsu_vx_d, 8) /* Vector Integer Divide Instructions */ #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M) @@ -1581,22 +1741,22 @@ RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM) RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM) RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM) RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM) -GEN_VEXT_VV(vdivu_vv_b, 1, 1) -GEN_VEXT_VV(vdivu_vv_h, 2, 2) -GEN_VEXT_VV(vdivu_vv_w, 4, 4) -GEN_VEXT_VV(vdivu_vv_d, 8, 8) -GEN_VEXT_VV(vdiv_vv_b, 1, 1) -GEN_VEXT_VV(vdiv_vv_h, 2, 2) -GEN_VEXT_VV(vdiv_vv_w, 4, 4) -GEN_VEXT_VV(vdiv_vv_d, 8, 8) -GEN_VEXT_VV(vremu_vv_b, 1, 1) -GEN_VEXT_VV(vremu_vv_h, 2, 2) -GEN_VEXT_VV(vremu_vv_w, 4, 4) -GEN_VEXT_VV(vremu_vv_d, 8, 8) -GEN_VEXT_VV(vrem_vv_b, 1, 1) -GEN_VEXT_VV(vrem_vv_h, 2, 2) -GEN_VEXT_VV(vrem_vv_w, 4, 4) -GEN_VEXT_VV(vrem_vv_d, 8, 8) +GEN_VEXT_VV(vdivu_vv_b, 1) +GEN_VEXT_VV(vdivu_vv_h, 2) +GEN_VEXT_VV(vdivu_vv_w, 4) +GEN_VEXT_VV(vdivu_vv_d, 8) +GEN_VEXT_VV(vdiv_vv_b, 1) +GEN_VEXT_VV(vdiv_vv_h, 2) +GEN_VEXT_VV(vdiv_vv_w, 4) +GEN_VEXT_VV(vdiv_vv_d, 8) +GEN_VEXT_VV(vremu_vv_b, 1) +GEN_VEXT_VV(vremu_vv_h, 2) +GEN_VEXT_VV(vremu_vv_w, 4) +GEN_VEXT_VV(vremu_vv_d, 8) +GEN_VEXT_VV(vrem_vv_b, 1) +GEN_VEXT_VV(vrem_vv_h, 2) +GEN_VEXT_VV(vrem_vv_w, 4) +GEN_VEXT_VV(vrem_vv_d, 8) RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU) RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU) @@ -1614,22 +1774,22 @@ RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM) RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM) RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM) RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM) -GEN_VEXT_VX(vdivu_vx_b, 1, 1) -GEN_VEXT_VX(vdivu_vx_h, 2, 2) -GEN_VEXT_VX(vdivu_vx_w, 4, 4) -GEN_VEXT_VX(vdivu_vx_d, 8, 8) -GEN_VEXT_VX(vdiv_vx_b, 1, 1) -GEN_VEXT_VX(vdiv_vx_h, 2, 2) -GEN_VEXT_VX(vdiv_vx_w, 4, 4) -GEN_VEXT_VX(vdiv_vx_d, 8, 8) -GEN_VEXT_VX(vremu_vx_b, 1, 1) -GEN_VEXT_VX(vremu_vx_h, 2, 2) -GEN_VEXT_VX(vremu_vx_w, 4, 4) -GEN_VEXT_VX(vremu_vx_d, 8, 8) -GEN_VEXT_VX(vrem_vx_b, 1, 1) -GEN_VEXT_VX(vrem_vx_h, 2, 2) -GEN_VEXT_VX(vrem_vx_w, 4, 4) -GEN_VEXT_VX(vrem_vx_d, 8, 8) +GEN_VEXT_VX(vdivu_vx_b, 1) +GEN_VEXT_VX(vdivu_vx_h, 2) +GEN_VEXT_VX(vdivu_vx_w, 4) +GEN_VEXT_VX(vdivu_vx_d, 8) +GEN_VEXT_VX(vdiv_vx_b, 1) +GEN_VEXT_VX(vdiv_vx_h, 2) +GEN_VEXT_VX(vdiv_vx_w, 4) +GEN_VEXT_VX(vdiv_vx_d, 8) +GEN_VEXT_VX(vremu_vx_b, 1) +GEN_VEXT_VX(vremu_vx_h, 2) +GEN_VEXT_VX(vremu_vx_w, 4) +GEN_VEXT_VX(vremu_vx_d, 8) +GEN_VEXT_VX(vrem_vx_b, 1) +GEN_VEXT_VX(vrem_vx_h, 2) +GEN_VEXT_VX(vrem_vx_w, 4) +GEN_VEXT_VX(vrem_vx_d, 8) /* Vector Widening Integer Multiply Instructions */ RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL) @@ -1641,15 +1801,15 @@ RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL) RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL) RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL) RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL) -GEN_VEXT_VV(vwmul_vv_b, 1, 2) -GEN_VEXT_VV(vwmul_vv_h, 2, 4) -GEN_VEXT_VV(vwmul_vv_w, 4, 8) -GEN_VEXT_VV(vwmulu_vv_b, 1, 2) -GEN_VEXT_VV(vwmulu_vv_h, 2, 4) -GEN_VEXT_VV(vwmulu_vv_w, 4, 8) -GEN_VEXT_VV(vwmulsu_vv_b, 1, 2) -GEN_VEXT_VV(vwmulsu_vv_h, 2, 4) -GEN_VEXT_VV(vwmulsu_vv_w, 4, 8) +GEN_VEXT_VV(vwmul_vv_b, 2) +GEN_VEXT_VV(vwmul_vv_h, 4) +GEN_VEXT_VV(vwmul_vv_w, 8) +GEN_VEXT_VV(vwmulu_vv_b, 2) +GEN_VEXT_VV(vwmulu_vv_h, 4) +GEN_VEXT_VV(vwmulu_vv_w, 8) +GEN_VEXT_VV(vwmulsu_vv_b, 2) +GEN_VEXT_VV(vwmulsu_vv_h, 4) +GEN_VEXT_VV(vwmulsu_vv_w, 8) RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL) RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL) @@ -1660,15 +1820,15 @@ RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL) RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL) RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL) RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL) -GEN_VEXT_VX(vwmul_vx_b, 1, 2) -GEN_VEXT_VX(vwmul_vx_h, 2, 4) -GEN_VEXT_VX(vwmul_vx_w, 4, 8) -GEN_VEXT_VX(vwmulu_vx_b, 1, 2) -GEN_VEXT_VX(vwmulu_vx_h, 2, 4) -GEN_VEXT_VX(vwmulu_vx_w, 4, 8) -GEN_VEXT_VX(vwmulsu_vx_b, 1, 2) -GEN_VEXT_VX(vwmulsu_vx_h, 2, 4) -GEN_VEXT_VX(vwmulsu_vx_w, 4, 8) +GEN_VEXT_VX(vwmul_vx_b, 2) +GEN_VEXT_VX(vwmul_vx_h, 4) +GEN_VEXT_VX(vwmul_vx_w, 8) +GEN_VEXT_VX(vwmulu_vx_b, 2) +GEN_VEXT_VX(vwmulu_vx_h, 4) +GEN_VEXT_VX(vwmulu_vx_w, 8) +GEN_VEXT_VX(vwmulsu_vx_b, 2) +GEN_VEXT_VX(vwmulsu_vx_h, 4) +GEN_VEXT_VX(vwmulsu_vx_w, 8) /* Vector Single-Width Integer Multiply-Add Instructions */ #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ @@ -1700,22 +1860,22 @@ RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB) RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB) RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB) RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB) -GEN_VEXT_VV(vmacc_vv_b, 1, 1) -GEN_VEXT_VV(vmacc_vv_h, 2, 2) -GEN_VEXT_VV(vmacc_vv_w, 4, 4) -GEN_VEXT_VV(vmacc_vv_d, 8, 8) -GEN_VEXT_VV(vnmsac_vv_b, 1, 1) -GEN_VEXT_VV(vnmsac_vv_h, 2, 2) -GEN_VEXT_VV(vnmsac_vv_w, 4, 4) -GEN_VEXT_VV(vnmsac_vv_d, 8, 8) -GEN_VEXT_VV(vmadd_vv_b, 1, 1) -GEN_VEXT_VV(vmadd_vv_h, 2, 2) -GEN_VEXT_VV(vmadd_vv_w, 4, 4) -GEN_VEXT_VV(vmadd_vv_d, 8, 8) -GEN_VEXT_VV(vnmsub_vv_b, 1, 1) -GEN_VEXT_VV(vnmsub_vv_h, 2, 2) -GEN_VEXT_VV(vnmsub_vv_w, 4, 4) -GEN_VEXT_VV(vnmsub_vv_d, 8, 8) +GEN_VEXT_VV(vmacc_vv_b, 1) +GEN_VEXT_VV(vmacc_vv_h, 2) +GEN_VEXT_VV(vmacc_vv_w, 4) +GEN_VEXT_VV(vmacc_vv_d, 8) +GEN_VEXT_VV(vnmsac_vv_b, 1) +GEN_VEXT_VV(vnmsac_vv_h, 2) +GEN_VEXT_VV(vnmsac_vv_w, 4) +GEN_VEXT_VV(vnmsac_vv_d, 8) +GEN_VEXT_VV(vmadd_vv_b, 1) +GEN_VEXT_VV(vmadd_vv_h, 2) +GEN_VEXT_VV(vmadd_vv_w, 4) +GEN_VEXT_VV(vmadd_vv_d, 8) +GEN_VEXT_VV(vnmsub_vv_b, 1) +GEN_VEXT_VV(vnmsub_vv_h, 2) +GEN_VEXT_VV(vnmsub_vv_w, 4) +GEN_VEXT_VV(vnmsub_vv_d, 8) #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ static void do_##NAME(void *vd, target_long s1, void *vs2, int i) \ @@ -1741,22 +1901,22 @@ RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB) RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB) RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB) RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB) -GEN_VEXT_VX(vmacc_vx_b, 1, 1) -GEN_VEXT_VX(vmacc_vx_h, 2, 2) -GEN_VEXT_VX(vmacc_vx_w, 4, 4) -GEN_VEXT_VX(vmacc_vx_d, 8, 8) -GEN_VEXT_VX(vnmsac_vx_b, 1, 1) -GEN_VEXT_VX(vnmsac_vx_h, 2, 2) -GEN_VEXT_VX(vnmsac_vx_w, 4, 4) -GEN_VEXT_VX(vnmsac_vx_d, 8, 8) -GEN_VEXT_VX(vmadd_vx_b, 1, 1) -GEN_VEXT_VX(vmadd_vx_h, 2, 2) -GEN_VEXT_VX(vmadd_vx_w, 4, 4) -GEN_VEXT_VX(vmadd_vx_d, 8, 8) -GEN_VEXT_VX(vnmsub_vx_b, 1, 1) -GEN_VEXT_VX(vnmsub_vx_h, 2, 2) -GEN_VEXT_VX(vnmsub_vx_w, 4, 4) -GEN_VEXT_VX(vnmsub_vx_d, 8, 8) +GEN_VEXT_VX(vmacc_vx_b, 1) +GEN_VEXT_VX(vmacc_vx_h, 2) +GEN_VEXT_VX(vmacc_vx_w, 4) +GEN_VEXT_VX(vmacc_vx_d, 8) +GEN_VEXT_VX(vnmsac_vx_b, 1) +GEN_VEXT_VX(vnmsac_vx_h, 2) +GEN_VEXT_VX(vnmsac_vx_w, 4) +GEN_VEXT_VX(vnmsac_vx_d, 8) +GEN_VEXT_VX(vmadd_vx_b, 1) +GEN_VEXT_VX(vmadd_vx_h, 2) +GEN_VEXT_VX(vmadd_vx_w, 4) +GEN_VEXT_VX(vmadd_vx_d, 8) +GEN_VEXT_VX(vnmsub_vx_b, 1) +GEN_VEXT_VX(vnmsub_vx_h, 2) +GEN_VEXT_VX(vnmsub_vx_w, 4) +GEN_VEXT_VX(vnmsub_vx_d, 8) /* Vector Widening Integer Multiply-Add Instructions */ RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC) @@ -1768,15 +1928,15 @@ RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC) RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC) RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC) RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC) -GEN_VEXT_VV(vwmaccu_vv_b, 1, 2) -GEN_VEXT_VV(vwmaccu_vv_h, 2, 4) -GEN_VEXT_VV(vwmaccu_vv_w, 4, 8) -GEN_VEXT_VV(vwmacc_vv_b, 1, 2) -GEN_VEXT_VV(vwmacc_vv_h, 2, 4) -GEN_VEXT_VV(vwmacc_vv_w, 4, 8) -GEN_VEXT_VV(vwmaccsu_vv_b, 1, 2) -GEN_VEXT_VV(vwmaccsu_vv_h, 2, 4) -GEN_VEXT_VV(vwmaccsu_vv_w, 4, 8) +GEN_VEXT_VV(vwmaccu_vv_b, 2) +GEN_VEXT_VV(vwmaccu_vv_h, 4) +GEN_VEXT_VV(vwmaccu_vv_w, 8) +GEN_VEXT_VV(vwmacc_vv_b, 2) +GEN_VEXT_VV(vwmacc_vv_h, 4) +GEN_VEXT_VV(vwmacc_vv_w, 8) +GEN_VEXT_VV(vwmaccsu_vv_b, 2) +GEN_VEXT_VV(vwmaccsu_vv_h, 4) +GEN_VEXT_VV(vwmaccsu_vv_w, 8) RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC) RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC) @@ -1790,18 +1950,18 @@ RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC) RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC) RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC) RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC) -GEN_VEXT_VX(vwmaccu_vx_b, 1, 2) -GEN_VEXT_VX(vwmaccu_vx_h, 2, 4) -GEN_VEXT_VX(vwmaccu_vx_w, 4, 8) -GEN_VEXT_VX(vwmacc_vx_b, 1, 2) -GEN_VEXT_VX(vwmacc_vx_h, 2, 4) -GEN_VEXT_VX(vwmacc_vx_w, 4, 8) -GEN_VEXT_VX(vwmaccsu_vx_b, 1, 2) -GEN_VEXT_VX(vwmaccsu_vx_h, 2, 4) -GEN_VEXT_VX(vwmaccsu_vx_w, 4, 8) -GEN_VEXT_VX(vwmaccus_vx_b, 1, 2) -GEN_VEXT_VX(vwmaccus_vx_h, 2, 4) -GEN_VEXT_VX(vwmaccus_vx_w, 4, 8) +GEN_VEXT_VX(vwmaccu_vx_b, 2) +GEN_VEXT_VX(vwmaccu_vx_h, 4) +GEN_VEXT_VX(vwmaccu_vx_w, 8) +GEN_VEXT_VX(vwmacc_vx_b, 2) +GEN_VEXT_VX(vwmacc_vx_h, 4) +GEN_VEXT_VX(vwmacc_vx_w, 8) +GEN_VEXT_VX(vwmaccsu_vx_b, 2) +GEN_VEXT_VX(vwmaccsu_vx_h, 4) +GEN_VEXT_VX(vwmaccsu_vx_w, 8) +GEN_VEXT_VX(vwmaccus_vx_b, 2) +GEN_VEXT_VX(vwmaccus_vx_h, 4) +GEN_VEXT_VX(vwmaccus_vx_w, 8) /* Vector Integer Merge and Move Instructions */ #define GEN_VEXT_VMV_VV(NAME, ETYPE, H) \ @@ -1809,6 +1969,9 @@ void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \ uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1816,6 +1979,8 @@ void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \ *((ETYPE *)vd + H(i)) = s1; \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t, H1) @@ -1828,12 +1993,17 @@ void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \ uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ *((ETYPE *)vd + H(i)) = (ETYPE)s1; \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t, H1) @@ -1846,6 +2016,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1853,6 +2026,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ *((ETYPE *)vd + H(i)) = *(vt + H(i)); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t, H1) @@ -1865,6 +2040,9 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ void *vs2, CPURISCVState *env, uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -1874,6 +2052,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ *((ETYPE *)vd + H(i)) = d; \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t, H1) @@ -1922,11 +2102,13 @@ vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2, static inline void vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2, CPURISCVState *env, - uint32_t desc, uint32_t esz, uint32_t dsz, - opivv2_rm_fn *fn) + uint32_t desc, + opivv2_rm_fn *fn, uint32_t esz) { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); switch (env->vxrm) { case 0: /* rnu */ @@ -1946,15 +2128,17 @@ vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2, env, vl, vm, 3, fn); break; } + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); } /* generate helpers for fixed point instructions with OPIVV format */ -#define GEN_VEXT_VV_RM(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VV_RM(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ - vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, ESZ, DSZ, \ - do_##NAME); \ + vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, \ + do_##NAME, ESZ); \ } static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b) @@ -2004,10 +2188,10 @@ RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8) RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16) RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32) RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64) -GEN_VEXT_VV_RM(vsaddu_vv_b, 1, 1) -GEN_VEXT_VV_RM(vsaddu_vv_h, 2, 2) -GEN_VEXT_VV_RM(vsaddu_vv_w, 4, 4) -GEN_VEXT_VV_RM(vsaddu_vv_d, 8, 8) +GEN_VEXT_VV_RM(vsaddu_vv_b, 1) +GEN_VEXT_VV_RM(vsaddu_vv_h, 2) +GEN_VEXT_VV_RM(vsaddu_vv_w, 4) +GEN_VEXT_VV_RM(vsaddu_vv_d, 8) typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i, CPURISCVState *env, int vxrm); @@ -2039,11 +2223,13 @@ vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2, static inline void vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2, CPURISCVState *env, - uint32_t desc, uint32_t esz, uint32_t dsz, - opivx2_rm_fn *fn) + uint32_t desc, + opivx2_rm_fn *fn, uint32_t esz) { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t total_elems = vext_get_total_elems(env, desc, esz); + uint32_t vta = vext_vta(desc); switch (env->vxrm) { case 0: /* rnu */ @@ -2063,25 +2249,27 @@ vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2, env, vl, vm, 3, fn); break; } + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); } /* generate helpers for fixed point instructions with OPIVX format */ -#define GEN_VEXT_VX_RM(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VX_RM(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ void *vs2, CPURISCVState *env, uint32_t desc) \ { \ - vext_vx_rm_2(vd, v0, s1, vs2, env, desc, ESZ, DSZ, \ - do_##NAME); \ + vext_vx_rm_2(vd, v0, s1, vs2, env, desc, \ + do_##NAME, ESZ); \ } RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8) RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16) RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32) RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64) -GEN_VEXT_VX_RM(vsaddu_vx_b, 1, 1) -GEN_VEXT_VX_RM(vsaddu_vx_h, 2, 2) -GEN_VEXT_VX_RM(vsaddu_vx_w, 4, 4) -GEN_VEXT_VX_RM(vsaddu_vx_d, 8, 8) +GEN_VEXT_VX_RM(vsaddu_vx_b, 1) +GEN_VEXT_VX_RM(vsaddu_vx_h, 2) +GEN_VEXT_VX_RM(vsaddu_vx_w, 4) +GEN_VEXT_VX_RM(vsaddu_vx_d, 8) static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) { @@ -2127,19 +2315,19 @@ RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8) RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16) RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32) RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64) -GEN_VEXT_VV_RM(vsadd_vv_b, 1, 1) -GEN_VEXT_VV_RM(vsadd_vv_h, 2, 2) -GEN_VEXT_VV_RM(vsadd_vv_w, 4, 4) -GEN_VEXT_VV_RM(vsadd_vv_d, 8, 8) +GEN_VEXT_VV_RM(vsadd_vv_b, 1) +GEN_VEXT_VV_RM(vsadd_vv_h, 2) +GEN_VEXT_VV_RM(vsadd_vv_w, 4) +GEN_VEXT_VV_RM(vsadd_vv_d, 8) RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8) RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16) RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32) RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64) -GEN_VEXT_VX_RM(vsadd_vx_b, 1, 1) -GEN_VEXT_VX_RM(vsadd_vx_h, 2, 2) -GEN_VEXT_VX_RM(vsadd_vx_w, 4, 4) -GEN_VEXT_VX_RM(vsadd_vx_d, 8, 8) +GEN_VEXT_VX_RM(vsadd_vx_b, 1) +GEN_VEXT_VX_RM(vsadd_vx_h, 2) +GEN_VEXT_VX_RM(vsadd_vx_w, 4) +GEN_VEXT_VX_RM(vsadd_vx_d, 8) static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b) { @@ -2188,19 +2376,19 @@ RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8) RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16) RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32) RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64) -GEN_VEXT_VV_RM(vssubu_vv_b, 1, 1) -GEN_VEXT_VV_RM(vssubu_vv_h, 2, 2) -GEN_VEXT_VV_RM(vssubu_vv_w, 4, 4) -GEN_VEXT_VV_RM(vssubu_vv_d, 8, 8) +GEN_VEXT_VV_RM(vssubu_vv_b, 1) +GEN_VEXT_VV_RM(vssubu_vv_h, 2) +GEN_VEXT_VV_RM(vssubu_vv_w, 4) +GEN_VEXT_VV_RM(vssubu_vv_d, 8) RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8) RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16) RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32) RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64) -GEN_VEXT_VX_RM(vssubu_vx_b, 1, 1) -GEN_VEXT_VX_RM(vssubu_vx_h, 2, 2) -GEN_VEXT_VX_RM(vssubu_vx_w, 4, 4) -GEN_VEXT_VX_RM(vssubu_vx_d, 8, 8) +GEN_VEXT_VX_RM(vssubu_vx_b, 1) +GEN_VEXT_VX_RM(vssubu_vx_h, 2) +GEN_VEXT_VX_RM(vssubu_vx_w, 4) +GEN_VEXT_VX_RM(vssubu_vx_d, 8) static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) { @@ -2246,19 +2434,19 @@ RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8) RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16) RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32) RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64) -GEN_VEXT_VV_RM(vssub_vv_b, 1, 1) -GEN_VEXT_VV_RM(vssub_vv_h, 2, 2) -GEN_VEXT_VV_RM(vssub_vv_w, 4, 4) -GEN_VEXT_VV_RM(vssub_vv_d, 8, 8) +GEN_VEXT_VV_RM(vssub_vv_b, 1) +GEN_VEXT_VV_RM(vssub_vv_h, 2) +GEN_VEXT_VV_RM(vssub_vv_w, 4) +GEN_VEXT_VV_RM(vssub_vv_d, 8) RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8) RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16) RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32) RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64) -GEN_VEXT_VX_RM(vssub_vx_b, 1, 1) -GEN_VEXT_VX_RM(vssub_vx_h, 2, 2) -GEN_VEXT_VX_RM(vssub_vx_w, 4, 4) -GEN_VEXT_VX_RM(vssub_vx_d, 8, 8) +GEN_VEXT_VX_RM(vssub_vx_b, 1) +GEN_VEXT_VX_RM(vssub_vx_h, 2) +GEN_VEXT_VX_RM(vssub_vx_w, 4) +GEN_VEXT_VX_RM(vssub_vx_d, 8) /* Vector Single-Width Averaging Add and Subtract */ static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift) @@ -2310,19 +2498,19 @@ RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64) -GEN_VEXT_VV_RM(vaadd_vv_b, 1, 1) -GEN_VEXT_VV_RM(vaadd_vv_h, 2, 2) -GEN_VEXT_VV_RM(vaadd_vv_w, 4, 4) -GEN_VEXT_VV_RM(vaadd_vv_d, 8, 8) +GEN_VEXT_VV_RM(vaadd_vv_b, 1) +GEN_VEXT_VV_RM(vaadd_vv_h, 2) +GEN_VEXT_VV_RM(vaadd_vv_w, 4) +GEN_VEXT_VV_RM(vaadd_vv_d, 8) RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32) RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32) RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32) RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64) -GEN_VEXT_VX_RM(vaadd_vx_b, 1, 1) -GEN_VEXT_VX_RM(vaadd_vx_h, 2, 2) -GEN_VEXT_VX_RM(vaadd_vx_w, 4, 4) -GEN_VEXT_VX_RM(vaadd_vx_d, 8, 8) +GEN_VEXT_VX_RM(vaadd_vx_b, 1) +GEN_VEXT_VX_RM(vaadd_vx_h, 2) +GEN_VEXT_VX_RM(vaadd_vx_w, 4) +GEN_VEXT_VX_RM(vaadd_vx_d, 8) static inline uint32_t aaddu32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b) @@ -2347,19 +2535,19 @@ RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32) RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32) RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32) RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64) -GEN_VEXT_VV_RM(vaaddu_vv_b, 1, 1) -GEN_VEXT_VV_RM(vaaddu_vv_h, 2, 2) -GEN_VEXT_VV_RM(vaaddu_vv_w, 4, 4) -GEN_VEXT_VV_RM(vaaddu_vv_d, 8, 8) +GEN_VEXT_VV_RM(vaaddu_vv_b, 1) +GEN_VEXT_VV_RM(vaaddu_vv_h, 2) +GEN_VEXT_VV_RM(vaaddu_vv_w, 4) +GEN_VEXT_VV_RM(vaaddu_vv_d, 8) RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32) RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32) RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32) RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64) -GEN_VEXT_VX_RM(vaaddu_vx_b, 1, 1) -GEN_VEXT_VX_RM(vaaddu_vx_h, 2, 2) -GEN_VEXT_VX_RM(vaaddu_vx_w, 4, 4) -GEN_VEXT_VX_RM(vaaddu_vx_d, 8, 8) +GEN_VEXT_VX_RM(vaaddu_vx_b, 1) +GEN_VEXT_VX_RM(vaaddu_vx_h, 2) +GEN_VEXT_VX_RM(vaaddu_vx_w, 4) +GEN_VEXT_VX_RM(vaaddu_vx_d, 8) static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b) { @@ -2383,19 +2571,19 @@ RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32) RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32) RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32) RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64) -GEN_VEXT_VV_RM(vasub_vv_b, 1, 1) -GEN_VEXT_VV_RM(vasub_vv_h, 2, 2) -GEN_VEXT_VV_RM(vasub_vv_w, 4, 4) -GEN_VEXT_VV_RM(vasub_vv_d, 8, 8) +GEN_VEXT_VV_RM(vasub_vv_b, 1) +GEN_VEXT_VV_RM(vasub_vv_h, 2) +GEN_VEXT_VV_RM(vasub_vv_w, 4) +GEN_VEXT_VV_RM(vasub_vv_d, 8) RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32) RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32) RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32) RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64) -GEN_VEXT_VX_RM(vasub_vx_b, 1, 1) -GEN_VEXT_VX_RM(vasub_vx_h, 2, 2) -GEN_VEXT_VX_RM(vasub_vx_w, 4, 4) -GEN_VEXT_VX_RM(vasub_vx_d, 8, 8) +GEN_VEXT_VX_RM(vasub_vx_b, 1) +GEN_VEXT_VX_RM(vasub_vx_h, 2) +GEN_VEXT_VX_RM(vasub_vx_w, 4) +GEN_VEXT_VX_RM(vasub_vx_d, 8) static inline uint32_t asubu32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b) @@ -2420,19 +2608,19 @@ RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32) RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32) RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32) RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64) -GEN_VEXT_VV_RM(vasubu_vv_b, 1, 1) -GEN_VEXT_VV_RM(vasubu_vv_h, 2, 2) -GEN_VEXT_VV_RM(vasubu_vv_w, 4, 4) -GEN_VEXT_VV_RM(vasubu_vv_d, 8, 8) +GEN_VEXT_VV_RM(vasubu_vv_b, 1) +GEN_VEXT_VV_RM(vasubu_vv_h, 2) +GEN_VEXT_VV_RM(vasubu_vv_w, 4) +GEN_VEXT_VV_RM(vasubu_vv_d, 8) RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32) RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32) RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32) RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64) -GEN_VEXT_VX_RM(vasubu_vx_b, 1, 1) -GEN_VEXT_VX_RM(vasubu_vx_h, 2, 2) -GEN_VEXT_VX_RM(vasubu_vx_w, 4, 4) -GEN_VEXT_VX_RM(vasubu_vx_d, 8, 8) +GEN_VEXT_VX_RM(vasubu_vx_b, 1) +GEN_VEXT_VX_RM(vasubu_vx_h, 2) +GEN_VEXT_VX_RM(vasubu_vx_w, 4) +GEN_VEXT_VX_RM(vasubu_vx_d, 8) /* Vector Single-Width Fractional Multiply with Rounding and Saturation */ static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) @@ -2527,19 +2715,19 @@ RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8) RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16) RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32) RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64) -GEN_VEXT_VV_RM(vsmul_vv_b, 1, 1) -GEN_VEXT_VV_RM(vsmul_vv_h, 2, 2) -GEN_VEXT_VV_RM(vsmul_vv_w, 4, 4) -GEN_VEXT_VV_RM(vsmul_vv_d, 8, 8) +GEN_VEXT_VV_RM(vsmul_vv_b, 1) +GEN_VEXT_VV_RM(vsmul_vv_h, 2) +GEN_VEXT_VV_RM(vsmul_vv_w, 4) +GEN_VEXT_VV_RM(vsmul_vv_d, 8) RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8) RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16) RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32) RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64) -GEN_VEXT_VX_RM(vsmul_vx_b, 1, 1) -GEN_VEXT_VX_RM(vsmul_vx_h, 2, 2) -GEN_VEXT_VX_RM(vsmul_vx_w, 4, 4) -GEN_VEXT_VX_RM(vsmul_vx_d, 8, 8) +GEN_VEXT_VX_RM(vsmul_vx_b, 1) +GEN_VEXT_VX_RM(vsmul_vx_h, 2) +GEN_VEXT_VX_RM(vsmul_vx_w, 4) +GEN_VEXT_VX_RM(vsmul_vx_d, 8) /* Vector Single-Width Scaling Shift Instructions */ static inline uint8_t @@ -2586,19 +2774,19 @@ RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8) RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16) RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32) RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64) -GEN_VEXT_VV_RM(vssrl_vv_b, 1, 1) -GEN_VEXT_VV_RM(vssrl_vv_h, 2, 2) -GEN_VEXT_VV_RM(vssrl_vv_w, 4, 4) -GEN_VEXT_VV_RM(vssrl_vv_d, 8, 8) +GEN_VEXT_VV_RM(vssrl_vv_b, 1) +GEN_VEXT_VV_RM(vssrl_vv_h, 2) +GEN_VEXT_VV_RM(vssrl_vv_w, 4) +GEN_VEXT_VV_RM(vssrl_vv_d, 8) RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8) RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16) RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32) RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64) -GEN_VEXT_VX_RM(vssrl_vx_b, 1, 1) -GEN_VEXT_VX_RM(vssrl_vx_h, 2, 2) -GEN_VEXT_VX_RM(vssrl_vx_w, 4, 4) -GEN_VEXT_VX_RM(vssrl_vx_d, 8, 8) +GEN_VEXT_VX_RM(vssrl_vx_b, 1) +GEN_VEXT_VX_RM(vssrl_vx_h, 2) +GEN_VEXT_VX_RM(vssrl_vx_w, 4) +GEN_VEXT_VX_RM(vssrl_vx_d, 8) static inline int8_t vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) @@ -2645,19 +2833,19 @@ RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8) RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16) RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32) RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64) -GEN_VEXT_VV_RM(vssra_vv_b, 1, 1) -GEN_VEXT_VV_RM(vssra_vv_h, 2, 2) -GEN_VEXT_VV_RM(vssra_vv_w, 4, 4) -GEN_VEXT_VV_RM(vssra_vv_d, 8, 8) +GEN_VEXT_VV_RM(vssra_vv_b, 1) +GEN_VEXT_VV_RM(vssra_vv_h, 2) +GEN_VEXT_VV_RM(vssra_vv_w, 4) +GEN_VEXT_VV_RM(vssra_vv_d, 8) RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8) RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16) RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32) RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64) -GEN_VEXT_VX_RM(vssra_vx_b, 1, 1) -GEN_VEXT_VX_RM(vssra_vx_h, 2, 2) -GEN_VEXT_VX_RM(vssra_vx_w, 4, 4) -GEN_VEXT_VX_RM(vssra_vx_d, 8, 8) +GEN_VEXT_VX_RM(vssra_vx_b, 1) +GEN_VEXT_VX_RM(vssra_vx_h, 2) +GEN_VEXT_VX_RM(vssra_vx_w, 4) +GEN_VEXT_VX_RM(vssra_vx_d, 8) /* Vector Narrowing Fixed-Point Clip Instructions */ static inline int8_t @@ -2720,16 +2908,16 @@ vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b) RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8) RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16) RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32) -GEN_VEXT_VV_RM(vnclip_wv_b, 1, 1) -GEN_VEXT_VV_RM(vnclip_wv_h, 2, 2) -GEN_VEXT_VV_RM(vnclip_wv_w, 4, 4) +GEN_VEXT_VV_RM(vnclip_wv_b, 1) +GEN_VEXT_VV_RM(vnclip_wv_h, 2) +GEN_VEXT_VV_RM(vnclip_wv_w, 4) RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8) RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16) RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32) -GEN_VEXT_VX_RM(vnclip_wx_b, 1, 1) -GEN_VEXT_VX_RM(vnclip_wx_h, 2, 2) -GEN_VEXT_VX_RM(vnclip_wx_w, 4, 4) +GEN_VEXT_VX_RM(vnclip_wx_b, 1) +GEN_VEXT_VX_RM(vnclip_wx_h, 2) +GEN_VEXT_VX_RM(vnclip_wx_w, 4) static inline uint8_t vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b) @@ -2782,16 +2970,16 @@ vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b) RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8) RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16) RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32) -GEN_VEXT_VV_RM(vnclipu_wv_b, 1, 1) -GEN_VEXT_VV_RM(vnclipu_wv_h, 2, 2) -GEN_VEXT_VV_RM(vnclipu_wv_w, 4, 4) +GEN_VEXT_VV_RM(vnclipu_wv_b, 1) +GEN_VEXT_VV_RM(vnclipu_wv_h, 2) +GEN_VEXT_VV_RM(vnclipu_wv_w, 4) RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8) RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16) RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32) -GEN_VEXT_VX_RM(vnclipu_wx_b, 1, 1) -GEN_VEXT_VX_RM(vnclipu_wx_h, 2, 2) -GEN_VEXT_VX_RM(vnclipu_wx_w, 4, 4) +GEN_VEXT_VX_RM(vnclipu_wx_b, 1) +GEN_VEXT_VX_RM(vnclipu_wx_h, 2) +GEN_VEXT_VX_RM(vnclipu_wx_w, 4) /* *** Vector Float Point Arithmetic Instructions @@ -2806,13 +2994,16 @@ static void do_##NAME(void *vd, void *vs1, void *vs2, int i, \ *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status); \ } -#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VV_ENV(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ void *vs2, CPURISCVState *env, \ uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, ESZ); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -2822,14 +3013,17 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ do_##NAME(vd, vs1, vs2, i, env); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * ESZ, \ + total_elems * ESZ); \ } RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add) RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add) RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add) -GEN_VEXT_VV_ENV(vfadd_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfadd_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfadd_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfadd_vv_h, 2) +GEN_VEXT_VV_ENV(vfadd_vv_w, 4) +GEN_VEXT_VV_ENV(vfadd_vv_d, 8) #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ @@ -2839,13 +3033,16 @@ static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\ } -#define GEN_VEXT_VF(NAME, ESZ, DSZ) \ +#define GEN_VEXT_VF(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, \ void *vs2, CPURISCVState *env, \ uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, ESZ); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -2855,27 +3052,30 @@ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, \ do_##NAME(vd, s1, vs2, i, env); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * ESZ, \ + total_elems * ESZ); \ } RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add) RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add) RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add) -GEN_VEXT_VF(vfadd_vf_h, 2, 2) -GEN_VEXT_VF(vfadd_vf_w, 4, 4) -GEN_VEXT_VF(vfadd_vf_d, 8, 8) +GEN_VEXT_VF(vfadd_vf_h, 2) +GEN_VEXT_VF(vfadd_vf_w, 4) +GEN_VEXT_VF(vfadd_vf_d, 8) RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub) RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub) RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub) -GEN_VEXT_VV_ENV(vfsub_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfsub_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfsub_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfsub_vv_h, 2) +GEN_VEXT_VV_ENV(vfsub_vv_w, 4) +GEN_VEXT_VV_ENV(vfsub_vv_d, 8) RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub) RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub) RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub) -GEN_VEXT_VF(vfsub_vf_h, 2, 2) -GEN_VEXT_VF(vfsub_vf_w, 4, 4) -GEN_VEXT_VF(vfsub_vf_d, 8, 8) +GEN_VEXT_VF(vfsub_vf_h, 2) +GEN_VEXT_VF(vfsub_vf_w, 4) +GEN_VEXT_VF(vfsub_vf_d, 8) static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s) { @@ -2895,9 +3095,9 @@ static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s) RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub) RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub) RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub) -GEN_VEXT_VF(vfrsub_vf_h, 2, 2) -GEN_VEXT_VF(vfrsub_vf_w, 4, 4) -GEN_VEXT_VF(vfrsub_vf_d, 8, 8) +GEN_VEXT_VF(vfrsub_vf_h, 2) +GEN_VEXT_VF(vfrsub_vf_w, 4) +GEN_VEXT_VF(vfrsub_vf_d, 8) /* Vector Widening Floating-Point Add/Subtract Instructions */ static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s) @@ -2915,12 +3115,12 @@ static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s) RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16) RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32) -GEN_VEXT_VV_ENV(vfwadd_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwadd_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwadd_vv_h, 4) +GEN_VEXT_VV_ENV(vfwadd_vv_w, 8) RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16) RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32) -GEN_VEXT_VF(vfwadd_vf_h, 2, 4) -GEN_VEXT_VF(vfwadd_vf_w, 4, 8) +GEN_VEXT_VF(vfwadd_vf_h, 4) +GEN_VEXT_VF(vfwadd_vf_w, 8) static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s) { @@ -2937,12 +3137,12 @@ static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s) RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16) RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32) -GEN_VEXT_VV_ENV(vfwsub_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwsub_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwsub_vv_h, 4) +GEN_VEXT_VV_ENV(vfwsub_vv_w, 8) RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16) RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32) -GEN_VEXT_VF(vfwsub_vf_h, 2, 4) -GEN_VEXT_VF(vfwsub_vf_w, 4, 8) +GEN_VEXT_VF(vfwsub_vf_h, 4) +GEN_VEXT_VF(vfwsub_vf_w, 8) static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s) { @@ -2956,12 +3156,12 @@ static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s) RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16) RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32) -GEN_VEXT_VV_ENV(vfwadd_wv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwadd_wv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwadd_wv_h, 4) +GEN_VEXT_VV_ENV(vfwadd_wv_w, 8) RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16) RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32) -GEN_VEXT_VF(vfwadd_wf_h, 2, 4) -GEN_VEXT_VF(vfwadd_wf_w, 4, 8) +GEN_VEXT_VF(vfwadd_wf_h, 4) +GEN_VEXT_VF(vfwadd_wf_w, 8) static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s) { @@ -2975,39 +3175,39 @@ static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s) RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16) RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32) -GEN_VEXT_VV_ENV(vfwsub_wv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwsub_wv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwsub_wv_h, 4) +GEN_VEXT_VV_ENV(vfwsub_wv_w, 8) RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16) RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32) -GEN_VEXT_VF(vfwsub_wf_h, 2, 4) -GEN_VEXT_VF(vfwsub_wf_w, 4, 8) +GEN_VEXT_VF(vfwsub_wf_h, 4) +GEN_VEXT_VF(vfwsub_wf_w, 8) /* Vector Single-Width Floating-Point Multiply/Divide Instructions */ RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul) RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul) RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul) -GEN_VEXT_VV_ENV(vfmul_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmul_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmul_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmul_vv_h, 2) +GEN_VEXT_VV_ENV(vfmul_vv_w, 4) +GEN_VEXT_VV_ENV(vfmul_vv_d, 8) RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul) RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul) RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul) -GEN_VEXT_VF(vfmul_vf_h, 2, 2) -GEN_VEXT_VF(vfmul_vf_w, 4, 4) -GEN_VEXT_VF(vfmul_vf_d, 8, 8) +GEN_VEXT_VF(vfmul_vf_h, 2) +GEN_VEXT_VF(vfmul_vf_w, 4) +GEN_VEXT_VF(vfmul_vf_d, 8) RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div) RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div) RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div) -GEN_VEXT_VV_ENV(vfdiv_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfdiv_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfdiv_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfdiv_vv_h, 2) +GEN_VEXT_VV_ENV(vfdiv_vv_w, 4) +GEN_VEXT_VV_ENV(vfdiv_vv_d, 8) RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div) RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div) RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div) -GEN_VEXT_VF(vfdiv_vf_h, 2, 2) -GEN_VEXT_VF(vfdiv_vf_w, 4, 4) -GEN_VEXT_VF(vfdiv_vf_d, 8, 8) +GEN_VEXT_VF(vfdiv_vf_h, 2) +GEN_VEXT_VF(vfdiv_vf_w, 4) +GEN_VEXT_VF(vfdiv_vf_d, 8) static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s) { @@ -3027,9 +3227,9 @@ static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s) RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv) RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv) RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv) -GEN_VEXT_VF(vfrdiv_vf_h, 2, 2) -GEN_VEXT_VF(vfrdiv_vf_w, 4, 4) -GEN_VEXT_VF(vfrdiv_vf_d, 8, 8) +GEN_VEXT_VF(vfrdiv_vf_h, 2) +GEN_VEXT_VF(vfrdiv_vf_w, 4) +GEN_VEXT_VF(vfrdiv_vf_d, 8) /* Vector Widening Floating-Point Multiply */ static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s) @@ -3046,12 +3246,12 @@ static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s) } RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16) RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32) -GEN_VEXT_VV_ENV(vfwmul_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwmul_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwmul_vv_h, 4) +GEN_VEXT_VV_ENV(vfwmul_vv_w, 8) RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16) RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32) -GEN_VEXT_VF(vfwmul_vf_h, 2, 4) -GEN_VEXT_VF(vfwmul_vf_w, 4, 8) +GEN_VEXT_VF(vfwmul_vf_h, 4) +GEN_VEXT_VF(vfwmul_vf_w, 8) /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */ #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ @@ -3082,9 +3282,9 @@ static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16) RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32) RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64) -GEN_VEXT_VV_ENV(vfmacc_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmacc_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmacc_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmacc_vv_h, 2) +GEN_VEXT_VV_ENV(vfmacc_vv_w, 4) +GEN_VEXT_VV_ENV(vfmacc_vv_d, 8) #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ @@ -3098,9 +3298,9 @@ static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16) RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32) RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64) -GEN_VEXT_VF(vfmacc_vf_h, 2, 2) -GEN_VEXT_VF(vfmacc_vf_w, 4, 4) -GEN_VEXT_VF(vfmacc_vf_d, 8, 8) +GEN_VEXT_VF(vfmacc_vf_h, 2) +GEN_VEXT_VF(vfmacc_vf_w, 4) +GEN_VEXT_VF(vfmacc_vf_d, 8) static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3123,15 +3323,15 @@ static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16) RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32) RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64) -GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2) +GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4) +GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8) RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16) RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32) RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64) -GEN_VEXT_VF(vfnmacc_vf_h, 2, 2) -GEN_VEXT_VF(vfnmacc_vf_w, 4, 4) -GEN_VEXT_VF(vfnmacc_vf_d, 8, 8) +GEN_VEXT_VF(vfnmacc_vf_h, 2) +GEN_VEXT_VF(vfnmacc_vf_w, 4) +GEN_VEXT_VF(vfnmacc_vf_d, 8) static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3151,15 +3351,15 @@ static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16) RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32) RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64) -GEN_VEXT_VV_ENV(vfmsac_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmsac_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmsac_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmsac_vv_h, 2) +GEN_VEXT_VV_ENV(vfmsac_vv_w, 4) +GEN_VEXT_VV_ENV(vfmsac_vv_d, 8) RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16) RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32) RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64) -GEN_VEXT_VF(vfmsac_vf_h, 2, 2) -GEN_VEXT_VF(vfmsac_vf_w, 4, 4) -GEN_VEXT_VF(vfmsac_vf_d, 8, 8) +GEN_VEXT_VF(vfmsac_vf_h, 2) +GEN_VEXT_VF(vfmsac_vf_w, 4) +GEN_VEXT_VF(vfmsac_vf_d, 8) static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3179,15 +3379,15 @@ static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16) RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32) RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64) -GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2) +GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4) +GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8) RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16) RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32) RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64) -GEN_VEXT_VF(vfnmsac_vf_h, 2, 2) -GEN_VEXT_VF(vfnmsac_vf_w, 4, 4) -GEN_VEXT_VF(vfnmsac_vf_d, 8, 8) +GEN_VEXT_VF(vfnmsac_vf_h, 2) +GEN_VEXT_VF(vfnmsac_vf_w, 4) +GEN_VEXT_VF(vfnmsac_vf_d, 8) static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3207,15 +3407,15 @@ static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16) RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32) RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64) -GEN_VEXT_VV_ENV(vfmadd_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmadd_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmadd_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmadd_vv_h, 2) +GEN_VEXT_VV_ENV(vfmadd_vv_w, 4) +GEN_VEXT_VV_ENV(vfmadd_vv_d, 8) RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16) RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32) RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64) -GEN_VEXT_VF(vfmadd_vf_h, 2, 2) -GEN_VEXT_VF(vfmadd_vf_w, 4, 4) -GEN_VEXT_VF(vfmadd_vf_d, 8, 8) +GEN_VEXT_VF(vfmadd_vf_h, 2) +GEN_VEXT_VF(vfmadd_vf_w, 4) +GEN_VEXT_VF(vfmadd_vf_d, 8) static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3238,15 +3438,15 @@ static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16) RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32) RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64) -GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2) +GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4) +GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8) RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16) RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32) RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64) -GEN_VEXT_VF(vfnmadd_vf_h, 2, 2) -GEN_VEXT_VF(vfnmadd_vf_w, 4, 4) -GEN_VEXT_VF(vfnmadd_vf_d, 8, 8) +GEN_VEXT_VF(vfnmadd_vf_h, 2) +GEN_VEXT_VF(vfnmadd_vf_w, 4) +GEN_VEXT_VF(vfnmadd_vf_d, 8) static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3266,15 +3466,15 @@ static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16) RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32) RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64) -GEN_VEXT_VV_ENV(vfmsub_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmsub_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmsub_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmsub_vv_h, 2) +GEN_VEXT_VV_ENV(vfmsub_vv_w, 4) +GEN_VEXT_VV_ENV(vfmsub_vv_d, 8) RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16) RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32) RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64) -GEN_VEXT_VF(vfmsub_vf_h, 2, 2) -GEN_VEXT_VF(vfmsub_vf_w, 4, 4) -GEN_VEXT_VF(vfmsub_vf_d, 8, 8) +GEN_VEXT_VF(vfmsub_vf_h, 2) +GEN_VEXT_VF(vfmsub_vf_w, 4) +GEN_VEXT_VF(vfmsub_vf_d, 8) static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s) { @@ -3294,15 +3494,15 @@ static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16) RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32) RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64) -GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2) +GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4) +GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8) RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16) RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32) RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64) -GEN_VEXT_VF(vfnmsub_vf_h, 2, 2) -GEN_VEXT_VF(vfnmsub_vf_w, 4, 4) -GEN_VEXT_VF(vfnmsub_vf_d, 8, 8) +GEN_VEXT_VF(vfnmsub_vf_h, 2) +GEN_VEXT_VF(vfnmsub_vf_w, 4) +GEN_VEXT_VF(vfnmsub_vf_d, 8) /* Vector Widening Floating-Point Fused Multiply-Add Instructions */ static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s) @@ -3319,12 +3519,12 @@ static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16) RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32) -GEN_VEXT_VV_ENV(vfwmacc_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwmacc_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4) +GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8) RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16) RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32) -GEN_VEXT_VF(vfwmacc_vf_h, 2, 4) -GEN_VEXT_VF(vfwmacc_vf_w, 4, 8) +GEN_VEXT_VF(vfwmacc_vf_h, 4) +GEN_VEXT_VF(vfwmacc_vf_w, 8) static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s) { @@ -3342,12 +3542,12 @@ static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16) RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32) -GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4) +GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8) RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16) RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32) -GEN_VEXT_VF(vfwnmacc_vf_h, 2, 4) -GEN_VEXT_VF(vfwnmacc_vf_w, 4, 8) +GEN_VEXT_VF(vfwnmacc_vf_h, 4) +GEN_VEXT_VF(vfwnmacc_vf_w, 8) static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s) { @@ -3365,12 +3565,12 @@ static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16) RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32) -GEN_VEXT_VV_ENV(vfwmsac_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwmsac_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4) +GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8) RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16) RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32) -GEN_VEXT_VF(vfwmsac_vf_h, 2, 4) -GEN_VEXT_VF(vfwmsac_vf_w, 4, 8) +GEN_VEXT_VF(vfwmsac_vf_h, 4) +GEN_VEXT_VF(vfwmsac_vf_w, 8) static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s) { @@ -3388,12 +3588,12 @@ static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s) RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16) RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32) -GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 2, 4) -GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 4, 8) +GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4) +GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8) RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16) RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32) -GEN_VEXT_VF(vfwnmsac_vf_h, 2, 4) -GEN_VEXT_VF(vfwnmsac_vf_w, 4, 8) +GEN_VEXT_VF(vfwnmsac_vf_h, 4) +GEN_VEXT_VF(vfwnmsac_vf_w, 8) /* Vector Floating-Point Square-Root Instruction */ /* (TD, T2, TX2) */ @@ -3409,12 +3609,15 @@ static void do_##NAME(void *vd, void *vs2, int i, \ *((TD *)vd + HD(i)) = OP(s2, &env->fp_status); \ } -#define GEN_VEXT_V_ENV(NAME, ESZ, DSZ) \ +#define GEN_VEXT_V_ENV(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, ESZ); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ if (vl == 0) { \ @@ -3427,14 +3630,16 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ do_##NAME(vd, vs2, i, env); \ } \ env->vstart = 0; \ + vext_set_elems_1s(vd, vta, vl * ESZ, \ + total_elems * ESZ); \ } RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt) RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt) RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt) -GEN_VEXT_V_ENV(vfsqrt_v_h, 2, 2) -GEN_VEXT_V_ENV(vfsqrt_v_w, 4, 4) -GEN_VEXT_V_ENV(vfsqrt_v_d, 8, 8) +GEN_VEXT_V_ENV(vfsqrt_v_h, 2) +GEN_VEXT_V_ENV(vfsqrt_v_w, 4) +GEN_VEXT_V_ENV(vfsqrt_v_d, 8) /* * Vector Floating-Point Reciprocal Square-Root Estimate Instruction @@ -3614,9 +3819,9 @@ static float64 frsqrt7_d(float64 f, float_status *s) RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h) RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s) RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d) -GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2, 2) -GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4, 4) -GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8, 8) +GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2) +GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4) +GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8) /* * Vector Floating-Point Reciprocal Estimate Instruction @@ -3805,36 +4010,36 @@ static float64 frec7_d(float64 f, float_status *s) RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h) RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s) RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d) -GEN_VEXT_V_ENV(vfrec7_v_h, 2, 2) -GEN_VEXT_V_ENV(vfrec7_v_w, 4, 4) -GEN_VEXT_V_ENV(vfrec7_v_d, 8, 8) +GEN_VEXT_V_ENV(vfrec7_v_h, 2) +GEN_VEXT_V_ENV(vfrec7_v_w, 4) +GEN_VEXT_V_ENV(vfrec7_v_d, 8) /* Vector Floating-Point MIN/MAX Instructions */ RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number) RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number) RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number) -GEN_VEXT_VV_ENV(vfmin_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmin_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmin_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmin_vv_h, 2) +GEN_VEXT_VV_ENV(vfmin_vv_w, 4) +GEN_VEXT_VV_ENV(vfmin_vv_d, 8) RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number) RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number) RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number) -GEN_VEXT_VF(vfmin_vf_h, 2, 2) -GEN_VEXT_VF(vfmin_vf_w, 4, 4) -GEN_VEXT_VF(vfmin_vf_d, 8, 8) +GEN_VEXT_VF(vfmin_vf_h, 2) +GEN_VEXT_VF(vfmin_vf_w, 4) +GEN_VEXT_VF(vfmin_vf_d, 8) RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number) RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number) RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number) -GEN_VEXT_VV_ENV(vfmax_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfmax_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfmax_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfmax_vv_h, 2) +GEN_VEXT_VV_ENV(vfmax_vv_w, 4) +GEN_VEXT_VV_ENV(vfmax_vv_d, 8) RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number) RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number) RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number) -GEN_VEXT_VF(vfmax_vf_h, 2, 2) -GEN_VEXT_VF(vfmax_vf_w, 4, 4) -GEN_VEXT_VF(vfmax_vf_d, 8, 8) +GEN_VEXT_VF(vfmax_vf_h, 2) +GEN_VEXT_VF(vfmax_vf_w, 4) +GEN_VEXT_VF(vfmax_vf_d, 8) /* Vector Floating-Point Sign-Injection Instructions */ static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s) @@ -3855,15 +4060,15 @@ static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s) RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16) RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32) RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64) -GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2) +GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4) +GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8) RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16) RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32) RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64) -GEN_VEXT_VF(vfsgnj_vf_h, 2, 2) -GEN_VEXT_VF(vfsgnj_vf_w, 4, 4) -GEN_VEXT_VF(vfsgnj_vf_d, 8, 8) +GEN_VEXT_VF(vfsgnj_vf_h, 2) +GEN_VEXT_VF(vfsgnj_vf_w, 4) +GEN_VEXT_VF(vfsgnj_vf_d, 8) static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s) { @@ -3883,15 +4088,15 @@ static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s) RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16) RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32) RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64) -GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2) +GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4) +GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8) RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16) RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32) RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64) -GEN_VEXT_VF(vfsgnjn_vf_h, 2, 2) -GEN_VEXT_VF(vfsgnjn_vf_w, 4, 4) -GEN_VEXT_VF(vfsgnjn_vf_d, 8, 8) +GEN_VEXT_VF(vfsgnjn_vf_h, 2) +GEN_VEXT_VF(vfsgnjn_vf_w, 4) +GEN_VEXT_VF(vfsgnjn_vf_d, 8) static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s) { @@ -3911,15 +4116,15 @@ static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s) RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16) RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32) RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64) -GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2, 2) -GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4, 4) -GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8, 8) +GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2) +GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4) +GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8) RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16) RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32) RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64) -GEN_VEXT_VF(vfsgnjx_vf_h, 2, 2) -GEN_VEXT_VF(vfsgnjx_vf_w, 4, 4) -GEN_VEXT_VF(vfsgnjx_vf_d, 8, 8) +GEN_VEXT_VF(vfsgnjx_vf_h, 2) +GEN_VEXT_VF(vfsgnjx_vf_w, 4) +GEN_VEXT_VF(vfsgnjx_vf_d, 8) /* Vector Floating-Point Compare Instructions */ #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP) \ @@ -3928,6 +4133,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -3940,6 +4147,13 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ DO_OP(s2, s1, &env->fp_status)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet) @@ -3952,6 +4166,8 @@ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -3963,6 +4179,13 @@ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ DO_OP(s2, (ETYPE)s1, &env->fp_status)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail-agnostic */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet) @@ -4063,12 +4286,15 @@ static void do_##NAME(void *vd, void *vs2, int i) \ *((TD *)vd + HD(i)) = OP(s2); \ } -#define GEN_VEXT_V(NAME, ESZ, DSZ) \ +#define GEN_VEXT_V(NAME, ESZ) \ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, ESZ); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4078,6 +4304,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ do_##NAME(vd, vs2, i); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * ESZ, \ + total_elems * ESZ); \ } target_ulong fclass_h(uint64_t frs1) @@ -4140,17 +4369,22 @@ target_ulong fclass_d(uint64_t frs1) RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h) RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s) RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d) -GEN_VEXT_V(vfclass_v_h, 2, 2) -GEN_VEXT_V(vfclass_v_w, 4, 4) -GEN_VEXT_V(vfclass_v_d, 8, 8) +GEN_VEXT_V(vfclass_v_h, 2) +GEN_VEXT_V(vfclass_v_w, 4) +GEN_VEXT_V(vfclass_v_d, 8) /* Vector Floating-Point Merge Instruction */ + #define GEN_VFMERGE_VF(NAME, ETYPE, H) \ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = \ + vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4159,6 +4393,8 @@ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ = (!vm && !vext_elem_mask(v0, i) ? s2 : s1); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2) @@ -4170,33 +4406,33 @@ GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8) RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16) RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32) RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64) -GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2, 2) -GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4, 4) -GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8, 8) +GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2) +GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4) +GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8) /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */ RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16) RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32) RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64) -GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2, 2) -GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4, 4) -GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8, 8) +GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2) +GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4) +GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8) /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */ RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16) RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32) RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64) -GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2, 2) -GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4, 4) -GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8, 8) +GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2) +GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4) +GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8) /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */ RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16) RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32) RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64) -GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2, 2) -GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4, 4) -GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8, 8) +GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2) +GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4) +GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8) /* Widening Floating-Point/Integer Type-Convert Instructions */ /* (TD, T2, TX2) */ @@ -4206,30 +4442,30 @@ GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8, 8) /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/ RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32) RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64) -GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 2, 4) -GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 4, 8) +GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4) +GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8) /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */ RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32) RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64) -GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 2, 4) -GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 4, 8) +GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4) +GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8) /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */ RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16) RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32) RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64) -GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 1, 2) -GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 2, 4) -GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 4, 8) +GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2) +GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4) +GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8) /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */ RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16) RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32) RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64) -GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 1, 2) -GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 2, 4) -GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 4, 8) +GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2) +GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4) +GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8) /* * vfwcvt.f.f.v vd, vs2, vm @@ -4242,8 +4478,8 @@ static uint32_t vfwcvtffv16(uint16_t a, float_status *s) RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16) RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64) -GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 2, 4) -GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 4, 8) +GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4) +GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8) /* Narrowing Floating-Point/Integer Type-Convert Instructions */ /* (TD, T2, TX2) */ @@ -4254,29 +4490,29 @@ GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 4, 8) RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8) RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16) RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32) -GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1, 1) -GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2, 2) -GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4, 4) +GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1) +GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2) +GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4) /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */ RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8) RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16) RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32) -GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1, 1) -GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2, 2) -GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4, 4) +GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1) +GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2) +GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4) /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */ RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16) RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32) -GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2, 2) -GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4, 4) +GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2) +GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4) /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */ RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16) RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32) -GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2, 2) -GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4, 4) +GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2) +GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4) /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */ static uint16_t vfncvtffv16(uint32_t a, float_status *s) @@ -4286,8 +4522,8 @@ static uint16_t vfncvtffv16(uint32_t a, float_status *s) RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16) RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32) -GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2, 2) -GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4, 4) +GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2) +GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4) /* *** Vector Reduction Operations @@ -4299,6 +4535,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(TD); \ + uint32_t vlenb = simd_maxsz(desc); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ TD s1 = *((TD *)vs1 + HD(0)); \ \ @@ -4311,6 +4550,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ } \ *((TD *)vd + HD(0)) = s1; \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, esz, vlenb); \ } /* vd[0] = sum(vs1[0], vs2[*]) */ @@ -4380,6 +4621,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(TD); \ + uint32_t vlenb = simd_maxsz(desc); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ TD s1 = *((TD *)vs1 + HD(0)); \ \ @@ -4392,6 +4636,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ } \ *((TD *)vd + HD(0)) = s1; \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, esz, vlenb); \ } /* Unordered sum */ @@ -4416,6 +4662,9 @@ void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1, { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t esz = sizeof(uint32_t); + uint32_t vlenb = simd_maxsz(desc); + uint32_t vta = vext_vta(desc); uint32_t i; uint32_t s1 = *((uint32_t *)vs1 + H4(0)); @@ -4429,6 +4678,8 @@ void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1, } *((uint32_t *)vd + H4(0)) = s1; env->vstart = 0; + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, esz, vlenb); } void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1, @@ -4436,6 +4687,9 @@ void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1, { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t esz = sizeof(uint64_t); + uint32_t vlenb = simd_maxsz(desc); + uint32_t vta = vext_vta(desc); uint32_t i; uint64_t s1 = *((uint64_t *)vs1); @@ -4449,6 +4703,8 @@ void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1, } *((uint64_t *)vd) = s1; env->vstart = 0; + /* set tail elements to 1s */ + vext_set_elems_1s(vd, vta, esz, vlenb); } /* @@ -4461,6 +4717,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t total_elems = env_archcpu(env)->cfg.vlen; \ + uint32_t vta_all_1s = vext_vta_all_1s(desc); \ uint32_t i; \ int a, b; \ \ @@ -4470,6 +4728,15 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, \ vext_set_elem_mask(vd, i, OP(b, a)); \ } \ env->vstart = 0; \ + /* mask destination register are always tail- \ + * agnostic \ + */ \ + /* set tail elements to 1s */ \ + if (vta_all_1s) { \ + for (; i < total_elems; i++) { \ + vext_set_elem_mask(vd, i, 1); \ + } \ + } \ } #define DO_NAND(N, M) (!(N & M)) @@ -4537,6 +4804,8 @@ static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env, { uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; + uint32_t total_elems = env_archcpu(env)->cfg.vlen; + uint32_t vta_all_1s = vext_vta_all_1s(desc); int i; bool first_mask_bit = false; @@ -4565,6 +4834,13 @@ static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env, } } env->vstart = 0; + /* mask destination register are always tail-agnostic */ + /* set tail elements to 1s */ + if (vta_all_1s) { + for (; i < total_elems; i++) { + vext_set_elem_mask(vd, i, 1); + } + } } void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env, @@ -4592,6 +4868,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t sum = 0; \ int i; \ \ @@ -4605,6 +4884,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env, \ } \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VIOTA_M(viota_m_b, uint8_t, H1) @@ -4618,6 +4899,9 @@ void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc) \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ int i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4627,6 +4911,8 @@ void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc) \ *((ETYPE *)vd + H(i)) = i; \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VID_V(vid_v_b, uint8_t, H1) @@ -4645,6 +4931,9 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ { \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ target_ulong offset = s1, i_min, i; \ \ i_min = MAX(env->vstart, offset); \ @@ -4654,6 +4943,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \ } \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */ @@ -4669,6 +4960,9 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ target_ulong i_max, i; \ \ i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart); \ @@ -4685,6 +4979,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */ @@ -4693,13 +4989,16 @@ GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2) GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4) GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8) -#define GEN_VEXT_VSLIE1UP(ESZ, H) \ -static void vslide1up_##ESZ(void *vd, void *v0, target_ulong s1, void *vs2, \ - CPURISCVState *env, uint32_t desc) \ +#define GEN_VEXT_VSLIE1UP(BITWIDTH, H) \ +static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1, \ + void *vs2, CPURISCVState *env, uint32_t desc) \ { \ - typedef uint##ESZ##_t ETYPE; \ + typedef uint##BITWIDTH##_t ETYPE; \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4713,6 +5012,8 @@ static void vslide1up_##ESZ(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VSLIE1UP(8, H1) @@ -4720,11 +5021,11 @@ GEN_VEXT_VSLIE1UP(16, H2) GEN_VEXT_VSLIE1UP(32, H4) GEN_VEXT_VSLIE1UP(64, H8) -#define GEN_VEXT_VSLIDE1UP_VX(NAME, ESZ) \ +#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH) \ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ - vslide1up_##ESZ(vd, v0, s1, vs2, env, desc); \ + vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \ } /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */ @@ -4733,13 +5034,16 @@ GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16) GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32) GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64) -#define GEN_VEXT_VSLIDE1DOWN(ESZ, H) \ -static void vslide1down_##ESZ(void *vd, void *v0, target_ulong s1, void *vs2, \ - CPURISCVState *env, uint32_t desc) \ +#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H) \ +static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1, \ + void *vs2, CPURISCVState *env, uint32_t desc) \ { \ - typedef uint##ESZ##_t ETYPE; \ + typedef uint##BITWIDTH##_t ETYPE; \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4753,6 +5057,8 @@ static void vslide1down_##ESZ(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_VSLIDE1DOWN(8, H1) @@ -4760,11 +5066,11 @@ GEN_VEXT_VSLIDE1DOWN(16, H2) GEN_VEXT_VSLIDE1DOWN(32, H4) GEN_VEXT_VSLIDE1DOWN(64, H8) -#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, ESZ) \ +#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH) \ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ - vslide1down_##ESZ(vd, v0, s1, vs2, env, desc); \ + vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \ } /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */ @@ -4774,11 +5080,11 @@ GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32) GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64) /* Vector Floating-Point Slide Instructions */ -#define GEN_VEXT_VFSLIDE1UP_VF(NAME, ESZ) \ +#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH) \ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ - vslide1up_##ESZ(vd, v0, s1, vs2, env, desc); \ + vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \ } /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */ @@ -4786,11 +5092,11 @@ GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16) GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32) GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64) -#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, ESZ) \ +#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH) \ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ - vslide1down_##ESZ(vd, v0, s1, vs2, env, desc); \ + vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \ } /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */ @@ -4806,6 +5112,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2))); \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(TS2); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint64_t index; \ uint32_t i; \ \ @@ -4821,6 +5130,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ } \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */ @@ -4841,6 +5152,9 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \ uint32_t vm = vext_vm(desc); \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint64_t index = s1; \ uint32_t i; \ \ @@ -4855,6 +5169,8 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */ @@ -4869,6 +5185,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t vl = env->vl; \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t num = 0, i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4879,6 +5198,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ num++; \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } /* Compress into vd elements of vs2 where vs1 is enabled */ @@ -4910,6 +5231,9 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ { \ uint32_t vl = env->vl; \ uint32_t vm = vext_vm(desc); \ + uint32_t esz = sizeof(ETYPE); \ + uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ + uint32_t vta = vext_vta(desc); \ uint32_t i; \ \ for (i = env->vstart; i < vl; i++) { \ @@ -4919,6 +5243,8 @@ void HELPER(NAME)(void *vd, void *v0, void *vs2, \ *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i)); \ } \ env->vstart = 0; \ + /* set tail elements to 1s */ \ + vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t, H2, H1)