From e8f433f80e831ecd81989ae0246f3f1e2d3ac480 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:07 +0200 Subject: [PATCH 01/22] memory: Let ReservedRegion use Range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A reserved region is a range tagged with a type. Let's directly use the Range type in the prospect to reuse some of the library helpers shipped with the Range type. Signed-off-by: Eric Auger Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cédric Le Goater Reviewed-by: "Michael S. Tsirkin" Signed-off-by: Cédric Le Goater --- hw/core/qdev-properties-system.c | 9 ++++++--- hw/virtio/virtio-iommu.c | 6 +++--- include/exec/memory.h | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 2f1dbb3fd7..b5ccafa29e 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -705,7 +705,7 @@ static void get_reserved_region(Object *obj, Visitor *v, const char *name, int rc; rc = snprintf(buffer, sizeof(buffer), "0x%"PRIx64":0x%"PRIx64":%u", - rr->low, rr->high, rr->type); + range_lob(&rr->range), range_upb(&rr->range), rr->type); assert(rc < sizeof(buffer)); visit_type_str(v, name, &p, errp); @@ -717,6 +717,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, Property *prop = opaque; ReservedRegion *rr = object_field_prop_ptr(obj, prop); const char *endptr; + uint64_t lob, upb; char *str; int ret; @@ -724,7 +725,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, return; } - ret = qemu_strtou64(str, &endptr, 16, &rr->low); + ret = qemu_strtou64(str, &endptr, 16, &lob); if (ret) { error_setg(errp, "start address of '%s'" " must be a hexadecimal integer", name); @@ -734,7 +735,7 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } - ret = qemu_strtou64(endptr + 1, &endptr, 16, &rr->high); + ret = qemu_strtou64(endptr + 1, &endptr, 16, &upb); if (ret) { error_setg(errp, "end address of '%s'" " must be a hexadecimal integer", name); @@ -744,6 +745,8 @@ static void set_reserved_region(Object *obj, Visitor *v, const char *name, goto separator_error; } + range_set_bounds(&rr->range, lob, upb); + ret = qemu_strtoui(endptr + 1, &endptr, 10, &rr->type); if (ret) { error_setg(errp, "type of '%s'" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index be51635895..e5e46e1b55 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -645,8 +645,8 @@ static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM); prop.head.length = cpu_to_le16(length); prop.subtype = subtype; - prop.start = cpu_to_le64(s->reserved_regions[i].low); - prop.end = cpu_to_le64(s->reserved_regions[i].high); + prop.start = cpu_to_le64(range_lob(&s->reserved_regions[i].range)); + prop.end = cpu_to_le64(range_upb(&s->reserved_regions[i].range)); memcpy(buf, &prop, size); @@ -897,7 +897,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, for (i = 0; i < s->nb_reserved_regions; i++) { ReservedRegion *reg = &s->reserved_regions[i]; - if (addr >= reg->low && addr <= reg->high) { + if (range_contains(®->range, addr)) { switch (reg->type) { case VIRTIO_IOMMU_RESV_MEM_T_MSI: entry.perm = flag; diff --git a/include/exec/memory.h b/include/exec/memory.h index 9087d02769..d94314d6fc 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -24,6 +24,7 @@ #include "qemu/bswap.h" #include "qemu/queue.h" #include "qemu/int128.h" +#include "qemu/range.h" #include "qemu/notify.h" #include "qom/object.h" #include "qemu/rcu.h" @@ -79,8 +80,7 @@ extern unsigned int global_dirty_tracking; typedef struct MemoryRegionOps MemoryRegionOps; struct ReservedRegion { - hwaddr low; - hwaddr high; + Range range; unsigned type; }; From 51478a8ef5694cbd92b9a3436ec4489464210a8e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:08 +0200 Subject: [PATCH 02/22] memory: Introduce memory_region_iommu_set_iova_ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will allow to convey information about valid IOVA ranges to virtual IOMMUS. Signed-off-by: Eric Auger Acked-by: Peter Xu Reviewed-by: "Michael S. Tsirkin" [ clg: fixes in memory_region_iommu_set_iova_ranges() and iommu_set_iova_ranges() documentation ] Signed-off-by: Cédric Le Goater --- include/exec/memory.h | 32 ++++++++++++++++++++++++++++++++ system/memory.c | 13 +++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index d94314d6fc..831f7c996d 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -527,6 +527,26 @@ struct IOMMUMemoryRegionClass { int (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, uint64_t page_size_mask, Error **errp); + /** + * @iommu_set_iova_ranges: + * + * Propagate information about the usable IOVA ranges for a given IOMMU + * memory region. Used for example to propagate host physical device + * reserved memory region constraints to the virtual IOMMU. + * + * Optional method: if this method is not provided, then the default IOVA + * aperture is used. + * + * @iommu: the IOMMUMemoryRegion + * + * @iova_ranges: list of ordered IOVA ranges (at least one range) + * + * Returns 0 on success, or a negative error. In case of failure, the error + * object must be created. + */ + int (*iommu_set_iova_ranges)(IOMMUMemoryRegion *iommu, + GList *iova_ranges, + Error **errp); }; typedef struct RamDiscardListener RamDiscardListener; @@ -1856,6 +1876,18 @@ int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, uint64_t page_size_mask, Error **errp); +/** + * memory_region_iommu_set_iova_ranges - Set the usable IOVA ranges + * for a given IOMMU MR region + * + * @iommu: IOMMU memory region + * @iova_ranges: list of ordered IOVA ranges (at least one range) + * @errp: pointer to Error*, to store an error if it happens. + */ +int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu, + GList *iova_ranges, + Error **errp); + /** * memory_region_name: get a memory region's name * diff --git a/system/memory.c b/system/memory.c index 4928f2525d..304fa843ea 100644 --- a/system/memory.c +++ b/system/memory.c @@ -1921,6 +1921,19 @@ int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, return ret; } +int memory_region_iommu_set_iova_ranges(IOMMUMemoryRegion *iommu_mr, + GList *iova_ranges, + Error **errp) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); + int ret = 0; + + if (imrc->iommu_set_iova_ranges) { + ret = imrc->iommu_set_iova_ranges(iommu_mr, iova_ranges, errp); + } + return ret; +} + int memory_region_register_iommu_notifier(MemoryRegion *mr, IOMMUNotifier *n, Error **errp) { From e4a8ae09c538880440ba866174b0015f147c8c9e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:09 +0200 Subject: [PATCH 03/22] vfio: Collect container iova range info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collect iova range information if VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability is supported. This allows to propagate the information though the IOMMU MR set_iova_ranges() callback so that virtual IOMMUs get aware of those aperture constraints. This is only done if the info is available and the number of iova ranges is greater than 0. A new vfio_get_info_iova_range helper is introduced matching the coding style of existing vfio_get_info_dma_avail. The boolean returned value isn't used though. Code is aligned between both. Signed-off-by: Eric Auger Reviewed-by: Alex Williamson Tested-by: Yanghang Liu Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 9 ++++++++ hw/vfio/container.c | 42 ++++++++++++++++++++++++++++++++--- include/hw/vfio/vfio-common.h | 1 + 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d806057b40..9c5c6433f2 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -693,6 +693,15 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } + if (container->iova_ranges) { + ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, + container->iova_ranges, &err); + if (ret) { + g_free(giommu); + goto fail; + } + } + ret = memory_region_register_iommu_notifier(section->mr, &giommu->n, &err); if (ret) { diff --git a/hw/vfio/container.c b/hw/vfio/container.c index adc467210f..fc88222377 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -382,7 +382,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, /* If the capability cannot be found, assume no DMA limiting */ hdr = vfio_get_iommu_type1_info_cap(info, VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); - if (hdr == NULL) { + if (!hdr) { return false; } @@ -394,6 +394,32 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, return true; } +static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + VFIOContainer *container) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_iova_range *cap; + + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + if (!hdr) { + return false; + } + + cap = (void *)hdr; + + for (int i = 0; i < cap->nr_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); + container->iova_ranges = + range_list_insert(container->iova_ranges, range); + } + + return true; +} + static void vfio_kvm_device_add_group(VFIOGroup *group) { Error *err = NULL; @@ -535,6 +561,12 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, } } +static void vfio_free_container(VFIOContainer *container) +{ + g_list_free_full(container->iova_ranges, g_free); + g_free(container); +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -616,6 +648,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->error = NULL; container->dirty_pages_supported = false; container->dma_max_mappings = 0; + container->iova_ranges = NULL; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); QLIST_INIT(&container->vrdl_list); @@ -652,6 +685,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { container->dma_max_mappings = 65535; } + + vfio_get_info_iova_range(info, container); + vfio_get_iommu_info_migration(container, info); g_free(info); @@ -765,7 +801,7 @@ enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - g_free(container); + vfio_free_container(container); close_fd_exit: close(fd); @@ -819,7 +855,7 @@ static void vfio_disconnect_container(VFIOGroup *group) trace_vfio_disconnect_container(container->fd); close(container->fd); - g_free(container); + vfio_free_container(container); vfio_put_address_space(space); } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 7780b9073a..0c3d390e8b 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -99,6 +99,7 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainer) next; QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; } VFIOContainer; typedef struct VFIOGuestIOMMU { From 41cc70cdf53268cd1bc9719014acf739932b51e5 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:10 +0200 Subject: [PATCH 04/22] virtio-iommu: Rename reserved_regions into prop_resv_regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename VirtIOIOMMU (nb_)reserved_regions fields with the "prop_" prefix to highlight those fields are set through a property, at machine level. They are IOMMU wide. A subsequent patch will introduce per IOMMUDevice reserved regions that will include both those IOMMU wide property reserved regions plus, sometimes, host reserved regions, if the device is backed by a host device protected by a physical IOMMU. Also change nb_ prefix by nr_. Signed-off-by: Eric Auger Reviewed-by: Cédric Le Goater Reviewed-by: "Michael S. Tsirkin" Signed-off-by: Cédric Le Goater --- hw/virtio/virtio-iommu-pci.c | 8 ++++---- hw/virtio/virtio-iommu.c | 15 ++++++++------- include/hw/virtio/virtio-iommu.h | 4 ++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c index 7ef2f9dcdb..9459fbf6ed 100644 --- a/hw/virtio/virtio-iommu-pci.c +++ b/hw/virtio/virtio-iommu-pci.c @@ -37,7 +37,7 @@ struct VirtIOIOMMUPCI { static Property virtio_iommu_pci_properties[] = { DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI, - vdev.nb_reserved_regions, vdev.reserved_regions, + vdev.nr_prop_resv_regions, vdev.prop_resv_regions, qdev_prop_reserved_region, ReservedRegion), DEFINE_PROP_END_OF_LIST(), }; @@ -54,9 +54,9 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) "for the virtio-iommu-pci device"); return; } - for (int i = 0; i < s->nb_reserved_regions; i++) { - if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && - s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { + for (int i = 0; i < s->nr_prop_resv_regions; i++) { + if (s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && + s->prop_resv_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { error_setg(errp, "reserved region %d has an invalid type", i); error_append_hint(errp, "Valid values are 0 and 1\n"); return; diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index e5e46e1b55..979cdb5648 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -631,22 +631,23 @@ static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, size_t size = sizeof(prop), length = size - sizeof(prop.head), total; int i; - total = size * s->nb_reserved_regions; + total = size * s->nr_prop_resv_regions; if (total > free) { return -ENOSPC; } - for (i = 0; i < s->nb_reserved_regions; i++) { - unsigned subtype = s->reserved_regions[i].type; + for (i = 0; i < s->nr_prop_resv_regions; i++) { + unsigned subtype = s->prop_resv_regions[i].type; + Range *range = &s->prop_resv_regions[i].range; assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED || subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI); prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM); prop.head.length = cpu_to_le16(length); prop.subtype = subtype; - prop.start = cpu_to_le64(range_lob(&s->reserved_regions[i].range)); - prop.end = cpu_to_le64(range_upb(&s->reserved_regions[i].range)); + prop.start = cpu_to_le64(range_lob(range)); + prop.end = cpu_to_le64(range_upb(range)); memcpy(buf, &prop, size); @@ -894,8 +895,8 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, goto unlock; } - for (i = 0; i < s->nb_reserved_regions; i++) { - ReservedRegion *reg = &s->reserved_regions[i]; + for (i = 0; i < s->nr_prop_resv_regions; i++) { + ReservedRegion *reg = &s->prop_resv_regions[i]; if (range_contains(®->range, addr)) { switch (reg->type) { diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index a93fc5383e..eea4564782 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -55,8 +55,8 @@ struct VirtIOIOMMU { GHashTable *as_by_busptr; IOMMUPciBus *iommu_pcibus_by_bus_num[PCI_BUS_MAX]; PCIBus *primary_bus; - ReservedRegion *reserved_regions; - uint32_t nb_reserved_regions; + ReservedRegion *prop_resv_regions; + uint32_t nr_prop_resv_regions; GTree *domains; QemuRecMutex mutex; GTree *endpoints; From 43f04cbeff863ae68b6ead432af5e771b92b934c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:11 +0200 Subject: [PATCH 05/22] range: Make range_compare() public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's expose range_compare() in the header so that it can be reused outside of util/range.c Signed-off-by: Eric Auger Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Cédric Le Goater --- include/qemu/range.h | 6 ++++++ util/range.c | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/qemu/range.h b/include/qemu/range.h index 7e2b1cc447..aa671da143 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -217,6 +217,12 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* + * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. + * Both @a and @b must not be empty. + */ +int range_compare(Range *a, Range *b); + GList *range_list_insert(GList *list, Range *data); #endif diff --git a/util/range.c b/util/range.c index 098d9d2dc0..782cb8b21c 100644 --- a/util/range.c +++ b/util/range.c @@ -20,11 +20,7 @@ #include "qemu/osdep.h" #include "qemu/range.h" -/* - * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. - * Both @a and @b must not be empty. - */ -static inline int range_compare(Range *a, Range *b) +int range_compare(Range *a, Range *b) { assert(!range_is_empty(a) && !range_is_empty(b)); From c3104847363f4ac5d4e76e8ed637280f7be1ee68 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:12 +0200 Subject: [PATCH 06/22] util/reserved-region: Add new ReservedRegion helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce resv_region_list_insert() helper which inserts a new ReservedRegion into a sorted list of reserved region. In case of overlap, the new region has higher priority and hides the existing overlapped segments. If the overlap is partial, new regions are created for parts which are not overlapped. The new region has higher priority independently on the type of the regions. Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Tested-by: Yanghang Liu Signed-off-by: Cédric Le Goater --- include/qemu/reserved-region.h | 32 ++++++++++++ util/meson.build | 1 + util/reserved-region.c | 91 ++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 include/qemu/reserved-region.h create mode 100644 util/reserved-region.c diff --git a/include/qemu/reserved-region.h b/include/qemu/reserved-region.h new file mode 100644 index 0000000000..8e6f0a97e2 --- /dev/null +++ b/include/qemu/reserved-region.h @@ -0,0 +1,32 @@ +/* + * QEMU ReservedRegion helpers + * + * Copyright (c) 2023 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef QEMU_RESERVED_REGION_H +#define QEMU_RESERVED_REGION_H + +#include "exec/memory.h" + +/* + * Insert a new region into a sorted list of reserved regions. In case + * there is overlap with existing regions, the new added region has + * higher priority and replaces the overlapped segment. + */ +GList *resv_region_list_insert(GList *list, ReservedRegion *reg); + +#endif diff --git a/util/meson.build b/util/meson.build index 769b24f2e0..c3a24af5c5 100644 --- a/util/meson.build +++ b/util/meson.build @@ -52,6 +52,7 @@ util_ss.add(files('qdist.c')) util_ss.add(files('qht.c')) util_ss.add(files('qsp.c')) util_ss.add(files('range.c')) +util_ss.add(files('reserved-region.c')) util_ss.add(files('stats64.c')) util_ss.add(files('systemd.c')) util_ss.add(files('transactions.c')) diff --git a/util/reserved-region.c b/util/reserved-region.c new file mode 100644 index 0000000000..18f83eb4c6 --- /dev/null +++ b/util/reserved-region.c @@ -0,0 +1,91 @@ +/* + * QEMU ReservedRegion helpers + * + * Copyright (c) 2023 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include "qemu/range.h" +#include "qemu/reserved-region.h" + +GList *resv_region_list_insert(GList *list, ReservedRegion *reg) +{ + ReservedRegion *resv_iter, *new_reg; + Range *r = ®->range; + Range *range_iter; + GList *l; + + for (l = list; l ; ) { + resv_iter = (ReservedRegion *)l->data; + range_iter = &resv_iter->range; + + /* Skip all list elements strictly less than range to add */ + if (range_compare(range_iter, r) < 0) { + l = l->next; + } else if (range_compare(range_iter, r) > 0) { + return g_list_insert_before(list, l, reg); + } else { /* there is an overlap */ + if (range_contains_range(r, range_iter)) { + /* new range contains current item, simply remove this latter */ + GList *prev = l->prev; + g_free(l->data); + list = g_list_delete_link(list, l); + if (prev) { + l = prev->next; + } else { + l = list; + } + } else if (range_contains_range(range_iter, r)) { + /* new region is included in the current region */ + if (range_lob(range_iter) == range_lob(r)) { + /* adjacent on the left side, derives into 2 regions */ + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + return g_list_insert_before(list, l, reg); + } else if (range_upb(range_iter) == range_upb(r)) { + /* adjacent on the right side, derives into 2 regions */ + range_set_bounds(range_iter, range_lob(range_iter), + range_lob(r) - 1); + l = l->next; + } else { + uint64_t lob = range_lob(range_iter); + /* + * the new range is in the middle of an existing one, + * split this latter into 3 regs instead + */ + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + new_reg = g_new0(ReservedRegion, 1); + new_reg->type = resv_iter->type; + range_set_bounds(&new_reg->range, + lob, range_lob(r) - 1); + list = g_list_insert_before(list, l, new_reg); + return g_list_insert_before(list, l, reg); + } + } else if (range_lob(r) < range_lob(range_iter)) { + range_set_bounds(range_iter, range_upb(r) + 1, + range_upb(range_iter)); + return g_list_insert_before(list, l, reg); + } else { /* intersection on the upper range */ + range_set_bounds(range_iter, range_lob(range_iter), + range_lob(r) - 1); + l = l->next; + } + } /* overlap */ + } + return g_list_append(list, reg); +} + From 908cae0de4fd63a58f5a7dc447f843a5be9cff46 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:13 +0200 Subject: [PATCH 07/22] virtio-iommu: Introduce per IOMMUDevice reserved regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the time being the per device reserved regions are just a duplicate of IOMMU wide reserved regions. Subsequent patches will combine those with host reserved regions, if any. Signed-off-by: Eric Auger Tested-by: Yanghang Liu Reviewed-by: "Michael S. Tsirkin" Signed-off-by: Cédric Le Goater --- hw/virtio/virtio-iommu.c | 37 +++++++++++++++++++++++++------- include/hw/virtio/virtio-iommu.h | 1 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 979cdb5648..0e2370663d 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -26,6 +26,7 @@ #include "sysemu/kvm.h" #include "sysemu/reset.h" #include "sysemu/sysemu.h" +#include "qemu/reserved-region.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -378,6 +379,19 @@ static void virtio_iommu_put_domain(gpointer data) g_free(domain); } +static void add_prop_resv_regions(IOMMUDevice *sdev) +{ + VirtIOIOMMU *s = sdev->viommu; + int i; + + for (i = 0; i < s->nr_prop_resv_regions; i++) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + + *reg = s->prop_resv_regions[i]; + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + } +} + static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, int devfn) { @@ -408,6 +422,7 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); + add_prop_resv_regions(sdev); /* * Build the IOMMU disabled container with aliases to the @@ -629,17 +644,23 @@ static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, { struct virtio_iommu_probe_resv_mem prop = {}; size_t size = sizeof(prop), length = size - sizeof(prop.head), total; - int i; + IOMMUDevice *sdev; + GList *l; - total = size * s->nr_prop_resv_regions; + sdev = container_of(virtio_iommu_mr(s, ep), IOMMUDevice, iommu_mr); + if (!sdev) { + return -EINVAL; + } + total = size * g_list_length(sdev->resv_regions); if (total > free) { return -ENOSPC; } - for (i = 0; i < s->nr_prop_resv_regions; i++) { - unsigned subtype = s->prop_resv_regions[i].type; - Range *range = &s->prop_resv_regions[i].range; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; + unsigned subtype = reg->type; + Range *range = ®->range; assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED || subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI); @@ -857,7 +878,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, bool bypass_allowed; int granule; bool found; - int i; + GList *l; interval.low = addr; interval.high = addr + 1; @@ -895,8 +916,8 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, goto unlock; } - for (i = 0; i < s->nr_prop_resv_regions; i++) { - ReservedRegion *reg = &s->prop_resv_regions[i]; + for (l = sdev->resv_regions; l; l = l->next) { + ReservedRegion *reg = l->data; if (range_contains(®->range, addr)) { switch (reg->type) { diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index eea4564782..70b8ace34d 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -39,6 +39,7 @@ typedef struct IOMMUDevice { AddressSpace as; MemoryRegion root; /* The root container of the device */ MemoryRegion bypass_mr; /* The alias of shared memory MR */ + GList *resv_regions; } IOMMUDevice; typedef struct IOMMUPciBus { From b439595a08d79120325de4684698bb7b6516aa8a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:14 +0200 Subject: [PATCH 08/22] range: Introduce range_inverse_array() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper reverses a list of regions within a [low, high] span, turning original regions into holes and original holes into actual regions, covering the whole UINT64_MAX span. Signed-off-by: Eric Auger Tested-by: Yanghang Liu Reviewed-by: "Michael S. Tsirkin" Signed-off-by: Cédric Le Goater --- include/qemu/range.h | 8 +++++++ util/range.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/qemu/range.h b/include/qemu/range.h index aa671da143..205e1da76d 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -225,4 +225,12 @@ int range_compare(Range *a, Range *b); GList *range_list_insert(GList *list, Range *data); +/* + * Inverse an array of sorted ranges over the [low, high] span, ie. + * original ranges becomes holes in the newly allocated inv_ranges + */ +void range_inverse_array(GList *in_ranges, + GList **out_ranges, + uint64_t low, uint64_t high); + #endif diff --git a/util/range.c b/util/range.c index 782cb8b21c..9605ccfcbe 100644 --- a/util/range.c +++ b/util/range.c @@ -66,3 +66,58 @@ GList *range_list_insert(GList *list, Range *data) return list; } + +static inline +GList *append_new_range(GList *list, uint64_t lob, uint64_t upb) +{ + Range *new = g_new0(Range, 1); + + range_set_bounds(new, lob, upb); + return g_list_append(list, new); +} + + +void range_inverse_array(GList *in, GList **rev, + uint64_t low, uint64_t high) +{ + Range *r, *rn; + GList *l = in, *out = *rev; + + for (l = in; l && range_upb(l->data) < low; l = l->next) { + continue; + } + + if (!l) { + out = append_new_range(out, low, high); + goto exit; + } + r = (Range *)l->data; + + /* first range lob is greater than min, insert a first range */ + if (range_lob(r) > low) { + out = append_new_range(out, low, MIN(range_lob(r) - 1, high)); + } + + /* insert a range inbetween each original range until we reach high */ + for (; l->next; l = l->next) { + r = (Range *)l->data; + rn = (Range *)l->next->data; + if (range_lob(r) >= high) { + goto exit; + } + if (range_compare(r, rn)) { + out = append_new_range(out, range_upb(r) + 1, + MIN(range_lob(rn) - 1, high)); + } + } + + /* last range */ + r = (Range *)l->data; + + /* last range upb is less than max, insert a last range */ + if (range_upb(r) < high) { + out = append_new_range(out, range_upb(r) + 1, high); + } +exit: + *rev = out; +} From 09b4c3d6a2f098e64cc25aa63f388ea943990279 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:15 +0200 Subject: [PATCH 09/22] virtio-iommu: Record whether a probe request has been issued MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an IOMMUDevice 'probe_done' flag to record that the driver already issued a probe request on that device. This will be useful to double check host reserved regions aren't notified after the probe and hence are not taken into account by the driver. Signed-off-by: Eric Auger Suggested-by: Jean-Philippe Brucker Reviewed-by: "Michael S. Tsirkin" Tested-by: Yanghang Liu Signed-off-by: Cédric Le Goater --- hw/virtio/virtio-iommu.c | 20 +++++++++++--------- include/hw/virtio/virtio-iommu.h | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 0e2370663d..13c3c087fe 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -639,19 +639,13 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, return ret; } -static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, +static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep, uint8_t *buf, size_t free) { struct virtio_iommu_probe_resv_mem prop = {}; size_t size = sizeof(prop), length = size - sizeof(prop.head), total; - IOMMUDevice *sdev; GList *l; - sdev = container_of(virtio_iommu_mr(s, ep), IOMMUDevice, iommu_mr); - if (!sdev) { - return -EINVAL; - } - total = size * g_list_length(sdev->resv_regions); if (total > free) { return -ENOSPC; @@ -688,19 +682,27 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, uint8_t *buf) { uint32_t ep_id = le32_to_cpu(req->endpoint); + IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id); size_t free = VIOMMU_PROBE_SIZE; + IOMMUDevice *sdev; ssize_t count; - if (!virtio_iommu_mr(s, ep_id)) { + if (!iommu_mr) { return VIRTIO_IOMMU_S_NOENT; } - count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free); + sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr); + if (!sdev) { + return -EINVAL; + } + + count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free); if (count < 0) { return VIRTIO_IOMMU_S_INVAL; } buf += count; free -= count; + sdev->probe_done = true; return VIRTIO_IOMMU_S_OK; } diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 70b8ace34d..1dd11ae81a 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -40,6 +40,7 @@ typedef struct IOMMUDevice { MemoryRegion root; /* The root container of the device */ MemoryRegion bypass_mr; /* The alias of shared memory MR */ GList *resv_regions; + bool probe_done; } IOMMUDevice; typedef struct IOMMUPciBus { From 30d40e39bdcb50e67f7cca7bee8bf59234c4ec12 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:16 +0200 Subject: [PATCH 10/22] virtio-iommu: Implement set_iova_ranges() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The implementation populates the array of per IOMMUDevice host reserved ranges. It is forbidden to have conflicting sets of host IOVA ranges to be applied onto the same IOMMU MR (implied by different host devices). In case the callback is called after the probe request has been issues by the driver, a warning is issued. Signed-off-by: Eric Auger Reviewed-by: "Michael S. Tsirkin" Tested-by: Yanghang Liu Signed-off-by: Cédric Le Goater --- hw/virtio/virtio-iommu.c | 67 ++++++++++++++++++++++++++++++++ include/hw/virtio/virtio-iommu.h | 1 + 2 files changed, 68 insertions(+) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 13c3c087fe..15aadd6fdd 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -20,6 +20,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "qemu/iov.h" +#include "qemu/range.h" #include "exec/target_page.h" #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" @@ -1155,6 +1156,71 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, return 0; } +/** + * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges + * + * The function turns those into reserved ranges. Once some + * reserved ranges have been set, new reserved regions cannot be + * added outside of the original ones. + * + * @mr: IOMMU MR + * @iova_ranges: list of usable IOVA ranges + * @errp: error handle + */ +static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr, + GList *iova_ranges, + Error **errp) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + GList *current_ranges = sdev->host_resv_ranges; + GList *l, *tmp, *new_ranges = NULL; + int ret = -EINVAL; + + /* check that each new resv region is included in an existing one */ + if (sdev->host_resv_ranges) { + range_inverse_array(iova_ranges, + &new_ranges, + 0, UINT64_MAX); + + for (tmp = new_ranges; tmp; tmp = tmp->next) { + Range *newr = (Range *)tmp->data; + bool included = false; + + for (l = current_ranges; l; l = l->next) { + Range * r = (Range *)l->data; + + if (range_contains_range(r, newr)) { + included = true; + break; + } + } + if (!included) { + goto error; + } + } + /* all new reserved ranges are included in existing ones */ + ret = 0; + goto out; + } + + if (sdev->probe_done) { + warn_report("%s: Notified about new host reserved regions after probe", + mr->parent_obj.name); + } + + range_inverse_array(iova_ranges, + &sdev->host_resv_ranges, + 0, UINT64_MAX); + + return 0; +error: + error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!", + mr->parent_obj.name); +out: + g_list_free_full(new_ranges, g_free); + return ret; +} + static void virtio_iommu_system_reset(void *opaque) { VirtIOIOMMU *s = opaque; @@ -1450,6 +1516,7 @@ static void virtio_iommu_memory_region_class_init(ObjectClass *klass, imrc->replay = virtio_iommu_replay; imrc->notify_flag_changed = virtio_iommu_notify_flag_changed; imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; + imrc->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges; } static const TypeInfo virtio_iommu_info = { diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 1dd11ae81a..781ebaea8f 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -40,6 +40,7 @@ typedef struct IOMMUDevice { MemoryRegion root; /* The root container of the device */ MemoryRegion bypass_mr; /* The alias of shared memory MR */ GList *resv_regions; + GList *host_resv_ranges; bool probe_done; } IOMMUDevice; From 5c476ba3fad3b7659988f28aae8b1b8d5331effc Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:17 +0200 Subject: [PATCH 11/22] virtio-iommu: Consolidate host reserved regions and property set ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up to now we were exposing to the RESV_MEM probe requests the reserved memory regions set though the reserved-regions array property. Combine those with the host reserved memory regions if any. Those latter are tagged as RESERVED. We don't have more information about them besides then cannot be mapped. Reserved regions set by property have higher priority. Signed-off-by: Eric Auger Reviewed-by: "Michael S. Tsirkin" Tested-by: Yanghang Liu Signed-off-by: Cédric Le Goater --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 0af7a2886c..637cac4edf 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -135,6 +135,7 @@ virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64 +virtio_iommu_host_resv_regions(const char *name, uint32_t index, uint64_t lob, uint64_t upb) "mr=%s host-resv-reg[%d] = [0x%"PRIx64",0x%"PRIx64"]" # virtio-mem.c virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 15aadd6fdd..dede0d41aa 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -21,6 +21,7 @@ #include "qemu/log.h" #include "qemu/iov.h" #include "qemu/range.h" +#include "qemu/reserved-region.h" #include "exec/target_page.h" #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" @@ -1156,6 +1157,40 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, return 0; } +/** + * rebuild_resv_regions: rebuild resv regions with both the + * info of host resv ranges and property set resv ranges + */ +static int rebuild_resv_regions(IOMMUDevice *sdev) +{ + GList *l; + int i = 0; + + /* free the existing list and rebuild it from scratch */ + g_list_free_full(sdev->resv_regions, g_free); + sdev->resv_regions = NULL; + + /* First add host reserved regions if any, all tagged as RESERVED */ + for (l = sdev->host_resv_ranges; l; l = l->next) { + ReservedRegion *reg = g_new0(ReservedRegion, 1); + Range *r = (Range *)l->data; + + reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED; + range_set_bounds(®->range, range_lob(r), range_upb(r)); + sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg); + trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i, + range_lob(®->range), + range_upb(®->range)); + i++; + } + /* + * then add higher priority reserved regions set by the machine + * through properties + */ + add_prop_resv_regions(sdev); + return 0; +} + /** * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges * @@ -1211,6 +1246,7 @@ static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr, range_inverse_array(iova_ranges, &sdev->host_resv_ranges, 0, UINT64_MAX); + rebuild_resv_regions(sdev); return 0; error: From 71177490a87c08e7ac53ab00bad06d68f439a4cd Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Oct 2023 15:45:18 +0200 Subject: [PATCH 12/22] test: Add some tests for range and resv-mem helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add unit tests for both resv_region_list_insert() and range_inverse_array(). Signed-off-by: Eric Auger Reviewed-by: "Michael S. Tsirkin" [ clg: Removal of unused variable in compare_ranges() ] Signed-off-by: Cédric Le Goater --- tests/unit/meson.build | 1 + tests/unit/test-resv-mem.c | 316 +++++++++++++++++++++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 tests/unit/test-resv-mem.c diff --git a/tests/unit/meson.build b/tests/unit/meson.build index f33ae64b8d..e6c51e7a86 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -21,6 +21,7 @@ tests = { 'test-opts-visitor': [testqapi], 'test-visitor-serialization': [testqapi], 'test-bitmap': [], + 'test-resv-mem': [], # all code tested by test-x86-cpuid is inside topology.h 'test-x86-cpuid': [], 'test-cutils': [], diff --git a/tests/unit/test-resv-mem.c b/tests/unit/test-resv-mem.c new file mode 100644 index 0000000000..5963274e2c --- /dev/null +++ b/tests/unit/test-resv-mem.c @@ -0,0 +1,316 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * reserved-region/range.c unit-tests. + * + * Copyright (C) 2023, Red Hat, Inc. + * + * Author: Eric Auger + */ + +#include "qemu/osdep.h" +#include "qemu/range.h" +#include "exec/memory.h" +#include "qemu/reserved-region.h" + +#define DEBUG 0 + +#if DEBUG +static void print_ranges(const char *prefix, GList *ranges) +{ + GList *l; + int i = 0; + + if (!g_list_length(ranges)) { + printf("%s is void\n", prefix); + return; + } + for (l = ranges; l; l = l->next) { + Range *r = (Range *)l->data; + + printf("%s rev[%i] = [0x%"PRIx64",0x%"PRIx64"]\n", + prefix, i, range_lob(r), range_upb(r)); + i++; + } +} +#endif + +static void compare_ranges(const char *prefix, GList *ranges, + GList *expected) +{ + GList *l, *e; + +#if DEBUG + print_ranges("out", ranges); + print_ranges("expected", expected); +#endif + g_assert_cmpint(g_list_length(ranges), ==, g_list_length(expected)); + for (l = ranges, e = expected; l ; l = l->next, e = e->next) { + Range *r = (Range *)l->data; + Range *er = (Range *)e->data; + + g_assert_true(range_lob(r) == range_lob(er) && + range_upb(r) == range_upb(er)); + } +} + +static GList *insert_sorted_range(GList *list, uint64_t lob, uint64_t upb) +{ + Range *new = g_new0(Range, 1); + + range_set_bounds(new, lob, upb); + return range_list_insert(list, new); +} + +static void reset(GList **in, GList **out, GList **expected) +{ + g_list_free_full(*in, g_free); + g_list_free_full(*out, g_free); + g_list_free_full(*expected, g_free); + *in = NULL; + *out = NULL; + *expected = NULL; +} + +static void +run_range_inverse_array(const char *prefix, GList **in, GList **expected, + uint64_t low, uint64_t high) +{ + GList *out = NULL; + range_inverse_array(*in, &out, low, high); + compare_ranges(prefix, out, *expected); + reset(in, &out, expected); +} + +static void check_range_reverse_array(void) +{ + GList *in = NULL, *expected = NULL; + + /* test 1 */ + + in = insert_sorted_range(in, 0x10000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 2 */ + + in = insert_sorted_range(in, 0x10000, 0xFFFFFFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 3 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x10000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x30000, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 4 */ + + in = insert_sorted_range(in, 0x50000, 0x5FFFF); + in = insert_sorted_range(in, 0x60000, 0xFFFFFFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0x4FFFF); + expected = insert_sorted_range(expected, 0x1000000000000, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 5 */ + + in = insert_sorted_range(in, 0x0, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); + + /* test 6 */ + in = insert_sorted_range(in, 0x10000, 0x1FFFF); + in = insert_sorted_range(in, 0x30000, 0x6FFFF); + in = insert_sorted_range(in, 0x90000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + expected = insert_sorted_range(expected, 0x20000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x70000, 0x8FFFF); + run_range_inverse_array("test1", &in, &expected, 0x0, UINT64_MAX); +} + +static void check_range_reverse_array_low_end(void) +{ + GList *in = NULL, *expected = NULL; + + /* test 1 */ + in = insert_sorted_range(in, 0x0, UINT64_MAX); + run_range_inverse_array("test1", &in, &expected, 0x10000, 0xFFFFFF); + + /* test 2 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test2", &in, &expected, 0x40000, 0xFFFFFFFFFFFF); + + /* test 3 */ + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x40000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test3", &in, &expected, 0x40000, 0xFFFFFFFFFFFF); + + /* test 4 */ + + in = insert_sorted_range(in, 0x0, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x1000000000000, UINT64_MAX); + expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test4", &in, &expected, 0x20000, 0xFFFFFFFFFFFF); + + /* test 5 */ + + in = insert_sorted_range(in, 0x2000, 0xFFFF); + in = insert_sorted_range(in, 0x20000, 0x2FFFF); + in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF); + expected = insert_sorted_range(expected, 0x1000, 0x1FFF); + expected = insert_sorted_range(expected, 0x10000, 0x1FFFF); + expected = insert_sorted_range(expected, 0x30000, 0xFFFFFFFF); + expected = insert_sorted_range(expected, 0x200000000, 0xFFFFFFFFFFFF); + run_range_inverse_array("test5", &in, &expected, 0x1000, 0xFFFFFFFFFFFF); + + /* test 6 */ + + in = insert_sorted_range(in, 0x10000000 , 0x1FFFFFFF); + in = insert_sorted_range(in, 0x100000000, 0x1FFFFFFFF); + expected = insert_sorted_range(expected, 0x0, 0xFFFF); + run_range_inverse_array("test6", &in, &expected, 0x0, 0xFFFF); +} + +static ReservedRegion *alloc_resv_mem(unsigned type, uint64_t lob, uint64_t upb) +{ + ReservedRegion *r; + + r = g_new0(ReservedRegion, 1); + r->type = type; + range_set_bounds(&r->range, lob, upb); + return r; +} + +static void print_resv_region_list(const char *prefix, GList *list, + uint32_t expected_length) +{ + int i = g_list_length(list); + + g_assert_cmpint(i, ==, expected_length); +#if DEBUG + i = 0; + for (GList *l = list; l; l = l->next) { + ReservedRegion *r = (ReservedRegion *)l->data; + Range *range = &r->range; + + printf("%s item[%d]=[0x%x, 0x%"PRIx64", 0x%"PRIx64"]\n", + prefix, i++, r->type, range_lob(range), range_upb(range)); + } +#endif +} + +static void free_resv_region(gpointer data) +{ + ReservedRegion *reg = (ReservedRegion *)data; + + g_free(reg); +} + +static void check_resv_region_list_insert(void) +{ + ReservedRegion *r[10]; + GList *l = NULL; + + r[0] = alloc_resv_mem(0xA, 0, 0xFFFF); + r[1] = alloc_resv_mem(0xA, 0x20000, 0x2FFFF); + l = resv_region_list_insert(l, r[0]); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test1", l, 2); + + /* adjacent on left */ + r[2] = alloc_resv_mem(0xB, 0x0, 0xFFF); + l = resv_region_list_insert(l, r[2]); + /* adjacent on right */ + r[3] = alloc_resv_mem(0xC, 0x21000, 0x2FFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test2", l, 4); + + /* exact overlap of D into C*/ + r[4] = alloc_resv_mem(0xD, 0x21000, 0x2FFFF); + l = resv_region_list_insert(l, r[4]); + print_resv_region_list("test3", l, 4); + + /* in the middle */ + r[5] = alloc_resv_mem(0xE, 0x22000, 0x23FFF); + l = resv_region_list_insert(l, r[5]); + print_resv_region_list("test4", l, 6); + + /* overwrites several existing ones */ + r[6] = alloc_resv_mem(0xF, 0x10000, 0x2FFFF); + l = resv_region_list_insert(l, r[6]); + print_resv_region_list("test5", l, 3); + + /* contiguous at the end */ + r[7] = alloc_resv_mem(0x0, 0x30000, 0x40000); + l = resv_region_list_insert(l, r[7]); + print_resv_region_list("test6", l, 4); + + g_list_free_full(l, free_resv_region); + l = NULL; + + r[0] = alloc_resv_mem(0x0, 0x10000, 0x1FFFF); + l = resv_region_list_insert(l, r[0]); + /* insertion before the 1st item */ + r[1] = alloc_resv_mem(0x1, 0x0, 0xFF); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test8", l, 2); + + /* collision on the left side */ + r[2] = alloc_resv_mem(0xA, 0x1200, 0x11FFF); + l = resv_region_list_insert(l, r[2]); + print_resv_region_list("test9", l, 3); + + /* collision on the right side */ + r[3] = alloc_resv_mem(0xA, 0x1F000, 0x2FFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test10", l, 4); + + /* override everything */ + r[4] = alloc_resv_mem(0xF, 0x0, UINT64_MAX); + l = resv_region_list_insert(l, r[4]); + print_resv_region_list("test11", l, 1); + + g_list_free_full(l, free_resv_region); + l = NULL; + + r[0] = alloc_resv_mem(0xF, 0x1000000000000, UINT64_MAX); + l = resv_region_list_insert(l, r[0]); + print_resv_region_list("test12", l, 1); + + r[1] = alloc_resv_mem(0xA, 0x0, 0xFFFFFFF); + l = resv_region_list_insert(l, r[1]); + print_resv_region_list("test12", l, 2); + + r[2] = alloc_resv_mem(0xB, 0x100000000, 0x1FFFFFFFF); + l = resv_region_list_insert(l, r[2]); + print_resv_region_list("test12", l, 3); + + r[3] = alloc_resv_mem(0x0, 0x010000000, 0x2FFFFFFFF); + l = resv_region_list_insert(l, r[3]); + print_resv_region_list("test12", l, 3); + + g_list_free_full(l, free_resv_region); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + + g_test_add_func("/resv-mem/range_reverse_array", + check_range_reverse_array); + g_test_add_func("/resv-mem/range_reverse_array_low_end", + check_range_reverse_array_low_end); + g_test_add_func("/resv-mem/resv_region_list_insert", + check_resv_region_list_insert); + + g_test_run(); + + return 0; +} From ba7d12eb8ce2d7367615071c0569947457d36803 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 17 Oct 2023 18:14:04 +0200 Subject: [PATCH 13/22] hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch modifies pci_setup_iommu() to set PCIIOMMUOps instead of setting PCIIOMMUFunc. PCIIOMMUFunc is used to get an address space for a PCI device in vendor specific way. The PCIIOMMUOps still offers this functionality. But using PCIIOMMUOps leaves space to add more iommu related vendor specific operations. Cc: Kevin Tian Cc: Jacob Pan Cc: Peter Xu Cc: Eric Auger Cc: Yi Sun Cc: David Gibson Cc: "Michael S. Tsirkin" Cc: Eric Auger Cc: Peter Maydell Cc: Paolo Bonzini Cc: Peter Xu Cc: Jason Wang Cc: Andrey Smirnov Cc: Helge Deller Cc: Hervé Poussineau Cc: Mark Cave-Ayland Cc: BALATON Zoltan Cc: Elena Ufimtseva Cc: Jagannathan Raman Cc: Matthew Rosato Cc: Eric Farman Cc: Halil Pasic Cc: Christian Borntraeger Cc: Thomas Huth Cc: Helge Deller Reviewed-by: David Gibson Reviewed-by: Peter Xu Signed-off-by: Yi Liu Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Auger Acked-by: Michael S. Tsirkin [ clg: - refreshed on latest QEMU - included hw/remote/iommu.c - documentation update - asserts in pci_setup_iommu() - removed checks on iommu_bus->iommu_ops->get_address_space - included Elroy PCI host (PA-RISC) ] Signed-off-by: Cédric Le Goater --- docs/devel/index-api.rst | 1 + docs/devel/pci.rst | 8 ++++++++ hw/alpha/typhoon.c | 6 +++++- hw/arm/smmu-common.c | 6 +++++- hw/i386/amd_iommu.c | 6 +++++- hw/i386/intel_iommu.c | 6 +++++- hw/pci-host/astro.c | 6 +++++- hw/pci-host/designware.c | 6 +++++- hw/pci-host/dino.c | 6 +++++- hw/pci-host/pnv_phb3.c | 6 +++++- hw/pci-host/pnv_phb4.c | 6 +++++- hw/pci-host/ppce500.c | 6 +++++- hw/pci-host/raven.c | 6 +++++- hw/pci-host/sabre.c | 6 +++++- hw/pci/pci.c | 18 +++++++++++++----- hw/ppc/ppc440_pcix.c | 6 +++++- hw/ppc/spapr_pci.c | 6 +++++- hw/remote/iommu.c | 6 +++++- hw/s390x/s390-pci-bus.c | 8 ++++++-- hw/virtio/virtio-iommu.c | 6 +++++- include/hw/pci/pci.h | 36 ++++++++++++++++++++++++++++++++++-- include/hw/pci/pci_bus.h | 2 +- 22 files changed, 143 insertions(+), 26 deletions(-) create mode 100644 docs/devel/pci.rst diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst index 539ad29c21..fe01b2b488 100644 --- a/docs/devel/index-api.rst +++ b/docs/devel/index-api.rst @@ -11,6 +11,7 @@ generated from in-code annotations to function prototypes. loads-stores memory modules + pci qom-api qdev-api ui diff --git a/docs/devel/pci.rst b/docs/devel/pci.rst new file mode 100644 index 0000000000..68739334f3 --- /dev/null +++ b/docs/devel/pci.rst @@ -0,0 +1,8 @@ +============= +PCI subsystem +============= + +API Reference +------------- + +.. kernel-doc:: include/hw/pci/pci.h diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c index 49a80550c5..e8711ae16a 100644 --- a/hw/alpha/typhoon.c +++ b/hw/alpha/typhoon.c @@ -738,6 +738,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &s->pchip.iommu_as; } +static const PCIIOMMUOps typhoon_iommu_ops = { + .get_address_space = typhoon_pci_dma_iommu, +}; + static void typhoon_set_irq(void *opaque, int irq, int level) { TyphoonState *s = opaque; @@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, qemu_irq *p_isa_irq, "iommu-typhoon", UINT64_MAX); address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu), "pchip0-pci"); - pci_setup_iommu(b, typhoon_pci_dma_iommu, s); + pci_setup_iommu(b, &typhoon_iommu_ops, s); /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800.0000, 64MB. */ memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops, diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index f35ae9aa22..9a8ac45431 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -605,6 +605,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) return &sdev->as; } +static const PCIIOMMUOps smmu_ops = { + .get_address_space = smmu_find_add_as, +}; + IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; @@ -661,7 +665,7 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, smmu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &smmu_ops, s); } else { error_setg(errp, "SMMU is not attached to any PCI bus!"); } diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 7965415b47..4203144da9 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1450,6 +1450,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu_as[devfn]->as; } +static const PCIIOMMUOps amdvi_iommu_ops = { + .get_address_space = amdvi_host_dma_iommu, +}; + static const MemoryRegionOps mmio_mem_ops = { .read = amdvi_mmio_read, .write = amdvi_mmio_write, @@ -1581,7 +1585,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) AMDVI_MMIO_SIZE); memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, &s->mmio); - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1c6c18622f..a6f1a7aa52 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -4088,6 +4088,10 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &vtd_as->as; } +static PCIIOMMUOps vtd_iommu_ops = { + .get_address_space = vtd_host_dma_iommu, +}; + static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); @@ -4210,7 +4214,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); vtd_init(s); - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); diff --git a/hw/pci-host/astro.c b/hw/pci-host/astro.c index 4b2d7caf2d..84e0ca14ac 100644 --- a/hw/pci-host/astro.c +++ b/hw/pci-host/astro.c @@ -345,6 +345,10 @@ static AddressSpace *elroy_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->astro->iommu_as; } +static const PCIIOMMUOps elroy_pcihost_iommu_ops = { + .get_address_space = elroy_pcihost_set_iommu, +}; + /* * Encoding in IOSAPIC: * base_addr == 0xfffa0000, we want to get 0xa0ff0000. @@ -834,7 +838,7 @@ static void astro_realize(DeviceState *obj, Error **errp) &elroy->pci_io); /* Host memory as seen from the PCI side, via the IOMMU. */ - pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, elroy_pcihost_set_iommu, + pci_setup_iommu(PCI_HOST_BRIDGE(elroy)->bus, &elroy_pcihost_iommu_ops, elroy); } } diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index 6f5442f108..f477f97847 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -663,6 +663,10 @@ static AddressSpace *designware_pcie_host_set_iommu(PCIBus *bus, void *opaque, return &s->pci.address_space; } +static const PCIIOMMUOps designware_iommu_ops = { + .get_address_space = designware_pcie_host_set_iommu, +}; + static void designware_pcie_host_realize(DeviceState *dev, Error **errp) { PCIHostState *pci = PCI_HOST_BRIDGE(dev); @@ -705,7 +709,7 @@ static void designware_pcie_host_realize(DeviceState *dev, Error **errp) address_space_init(&s->pci.address_space, &s->pci.address_space_root, "pcie-bus-address-space"); - pci_setup_iommu(pci->bus, designware_pcie_host_set_iommu, s); + pci_setup_iommu(pci->bus, &designware_iommu_ops, s); qdev_realize(DEVICE(&s->root), BUS(pci->bus), &error_fatal); } diff --git a/hw/pci-host/dino.c b/hw/pci-host/dino.c index 82503229fa..5b0947a16c 100644 --- a/hw/pci-host/dino.c +++ b/hw/pci-host/dino.c @@ -354,6 +354,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps dino_iommu_ops = { + .get_address_space = dino_pcihost_set_iommu, +}; + /* * Dino interrupts are connected as shown on Page 78, Table 23 * (Little-endian bit numbers) @@ -481,7 +485,7 @@ static void dino_pcihost_init(Object *obj) g_free(name); } - pci_setup_iommu(phb->bus, dino_pcihost_set_iommu, s); + pci_setup_iommu(phb->bus, &dino_iommu_ops, s); sysbus_init_mmio(sbd, &s->this_mem); diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c index c5e58f4086..2a74dbe45f 100644 --- a/hw/pci-host/pnv_phb3.c +++ b/hw/pci-host/pnv_phb3.c @@ -968,6 +968,10 @@ static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &ds->dma_as; } +static PCIIOMMUOps pnv_phb3_iommu_ops = { + .get_address_space = pnv_phb3_dma_iommu, +}; + static void pnv_phb3_instance_init(Object *obj) { PnvPHB3 *phb = PNV_PHB3(obj); @@ -1012,7 +1016,7 @@ void pnv_phb3_bus_init(DeviceState *dev, PnvPHB3 *phb) object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id, &error_abort); - pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb); + pci_setup_iommu(pci->bus, &pnv_phb3_iommu_ops, phb); } static void pnv_phb3_realize(DeviceState *dev, Error **errp) diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c index 29cb11a5d9..37c7afc18c 100644 --- a/hw/pci-host/pnv_phb4.c +++ b/hw/pci-host/pnv_phb4.c @@ -1518,6 +1518,10 @@ static void pnv_phb4_xscom_realize(PnvPHB4 *phb) &phb->phb_regs_mr); } +static PCIIOMMUOps pnv_phb4_iommu_ops = { + .get_address_space = pnv_phb4_dma_iommu, +}; + static void pnv_phb4_instance_init(Object *obj) { PnvPHB4 *phb = PNV_PHB4(obj); @@ -1557,7 +1561,7 @@ void pnv_phb4_bus_init(DeviceState *dev, PnvPHB4 *phb) object_property_set_int(OBJECT(pci->bus), "chip-id", phb->chip_id, &error_abort); - pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb); + pci_setup_iommu(pci->bus, &pnv_phb4_iommu_ops, phb); pci->bus->flags |= PCI_BUS_EXTENDED_CONFIG_SPACE; } diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c index 38814247f2..453a4e6ed3 100644 --- a/hw/pci-host/ppce500.c +++ b/hw/pci-host/ppce500.c @@ -435,6 +435,10 @@ static AddressSpace *e500_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps ppce500_iommu_ops = { + .get_address_space = e500_pcihost_set_iommu, +}; + static void e500_pcihost_realize(DeviceState *dev, Error **errp) { SysBusDevice *sbd = SYS_BUS_DEVICE(dev); @@ -469,7 +473,7 @@ static void e500_pcihost_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-e500", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(b, e500_pcihost_set_iommu, s); + pci_setup_iommu(b, &ppce500_iommu_ops, s); pci_create_simple(b, 0, "e500-host-bridge"); diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index 9a11ac4b2b..86c3a49087 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -223,6 +223,10 @@ static AddressSpace *raven_pcihost_set_iommu(PCIBus *bus, void *opaque, return &s->bm_as; } +static const PCIIOMMUOps raven_iommu_ops = { + .get_address_space = raven_pcihost_set_iommu, +}; + static void raven_change_gpio(void *opaque, int n, int level) { PREPPCIState *s = opaque; @@ -320,7 +324,7 @@ static void raven_pcihost_initfn(Object *obj) memory_region_add_subregion(&s->bm, 0 , &s->bm_pci_memory_alias); memory_region_add_subregion(&s->bm, 0x80000000, &s->bm_ram_alias); address_space_init(&s->bm_as, &s->bm, "raven-bm"); - pci_setup_iommu(&s->pci_bus, raven_pcihost_set_iommu, s); + pci_setup_iommu(&s->pci_bus, &raven_iommu_ops, s); h->bus = &s->pci_bus; diff --git a/hw/pci-host/sabre.c b/hw/pci-host/sabre.c index dcb2e230b6..d0851b48b0 100644 --- a/hw/pci-host/sabre.c +++ b/hw/pci-host/sabre.c @@ -112,6 +112,10 @@ static AddressSpace *sabre_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &is->iommu_as; } +static const PCIIOMMUOps sabre_iommu_ops = { + .get_address_space = sabre_pci_dma_iommu, +}; + static void sabre_config_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -384,7 +388,7 @@ static void sabre_realize(DeviceState *dev, Error **errp) /* IOMMU */ memory_region_add_subregion_overlap(&s->sabre_config, 0x200, sysbus_mmio_get_region(SYS_BUS_DEVICE(s->iommu), 0), 1); - pci_setup_iommu(phb->bus, sabre_pci_dma_iommu, s->iommu); + pci_setup_iommu(phb->bus, &sabre_iommu_ops, s->iommu); /* APB secondary busses */ pci_dev = pci_new_multifunction(PCI_DEVFN(1, 0), TYPE_SIMBA_PCI_BRIDGE); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 885c04b6f5..c49417abb2 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2678,7 +2678,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) PCIBus *iommu_bus = bus; uint8_t devfn = dev->devfn; - while (iommu_bus && !iommu_bus->iommu_fn && iommu_bus->parent_dev) { + while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); /* @@ -2717,15 +2717,23 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) { - return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn); + if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + return iommu_bus->iommu_ops->get_address_space(bus, + iommu_bus->iommu_opaque, devfn); } return &address_space_memory; } -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) +void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { - bus->iommu_fn = fn; + /* + * If called, pci_setup_iommu() should provide a minimum set of + * useful callbacks for the bus. + */ + assert(ops); + assert(ops->get_address_space); + + bus->iommu_ops = ops; bus->iommu_opaque = opaque; } diff --git a/hw/ppc/ppc440_pcix.c b/hw/ppc/ppc440_pcix.c index 672090de94..df4ee374d0 100644 --- a/hw/ppc/ppc440_pcix.c +++ b/hw/ppc/ppc440_pcix.c @@ -449,6 +449,10 @@ static AddressSpace *ppc440_pcix_set_iommu(PCIBus *b, void *opaque, int devfn) return &s->bm_as; } +static const PCIIOMMUOps ppc440_iommu_ops = { + .get_address_space = ppc440_pcix_set_iommu, +}; + /* * Some guests on sam460ex write all kinds of garbage here such as * missing enable bit and low bits set and still expect this to work @@ -503,7 +507,7 @@ static void ppc440_pcix_realize(DeviceState *dev, Error **errp) memory_region_init(&s->bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX); memory_region_add_subregion(&s->bm, 0x0, &s->busmem); address_space_init(&s->bm_as, &s->bm, "pci-bm"); - pci_setup_iommu(h->bus, ppc440_pcix_set_iommu, s); + pci_setup_iommu(h->bus, &ppc440_iommu_ops, s); memory_region_init(&s->container, OBJECT(s), "pci-container", PCI_ALL_SIZE); memory_region_init_io(&h->conf_mem, OBJECT(s), &ppc440_pcix_host_conf_ops, diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 370c5a90f2..a27024e45a 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -780,6 +780,10 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &phb->iommu_as; } +static const PCIIOMMUOps spapr_iommu_ops = { + .get_address_space = spapr_pci_dma_iommu, +}; + static char *spapr_phb_vfio_get_loc_code(SpaprPhbState *sphb, PCIDevice *pdev) { g_autofree char *path = NULL; @@ -1978,7 +1982,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(&sphb->iommu_root, SPAPR_PCI_MSI_WINDOW, &sphb->msiwindow); - pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb); + pci_setup_iommu(bus, &spapr_iommu_ops, sphb); pci_bus_set_route_irq_fn(bus, spapr_route_intx_pin_to_irq); diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c index 1391dd712c..7c56aad0fc 100644 --- a/hw/remote/iommu.c +++ b/hw/remote/iommu.c @@ -100,6 +100,10 @@ static void remote_iommu_finalize(Object *obj) iommu->elem_by_devfn = NULL; } +static const PCIIOMMUOps remote_iommu_ops = { + .get_address_space = remote_iommu_find_add_as, +}; + void remote_iommu_setup(PCIBus *pci_bus) { RemoteIommu *iommu = NULL; @@ -108,7 +112,7 @@ void remote_iommu_setup(PCIBus *pci_bus) iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU)); - pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu); + pci_setup_iommu(pci_bus, &remote_iommu_ops, iommu); object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu)); diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 2ca36f9f3b..347580ebac 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -652,6 +652,10 @@ static AddressSpace *s390_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &iommu->as; } +static const PCIIOMMUOps s390_iommu_ops = { + .get_address_space = s390_pci_dma_iommu, +}; + static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set) { uint8_t expected, actual; @@ -839,7 +843,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) b = pci_register_root_bus(dev, NULL, s390_pci_set_irq, s390_pci_map_irq, NULL, get_system_memory(), get_system_io(), 0, 64, TYPE_PCI_BUS); - pci_setup_iommu(b, s390_pci_dma_iommu, s); + pci_setup_iommu(b, &s390_iommu_ops, s); bus = BUS(b); qbus_set_hotplug_handler(bus, OBJECT(dev)); @@ -1058,7 +1062,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pdev = PCI_DEVICE(dev); pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq); - pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s); + pci_setup_iommu(&pb->sec_bus, &s390_iommu_ops, s); qbus_set_hotplug_handler(BUS(&pb->sec_bus), OBJECT(s)); diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index dede0d41aa..89fb5767d1 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -461,6 +461,10 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, return &sdev->as; } +static const PCIIOMMUOps virtio_iommu_ops = { + .get_address_space = virtio_iommu_find_add_as, +}; + static int virtio_iommu_attach(VirtIOIOMMU *s, struct virtio_iommu_req_attach *req) { @@ -1332,7 +1336,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free); if (s->primary_bus) { - pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s); + pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s); } else { error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); } diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index ea5aff118b..fa6313aabc 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -363,10 +363,42 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range); void pci_device_deassert_intx(PCIDevice *dev); -typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); + +/** + * struct PCIIOMMUOps: callbacks structure for specific IOMMU handlers + * of a PCIBus + * + * Allows to modify the behavior of some IOMMU operations of the PCI + * framework for a set of devices on a PCI bus. + */ +typedef struct PCIIOMMUOps { + /** + * @get_address_space: get the address space for a set of devices + * on a PCI bus. + * + * Mandatory callback which returns a pointer to an #AddressSpace + * + * @bus: the #PCIBus being accessed. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number + */ + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); +} PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + +/** + * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus + * + * Let PCI host bridges define specific operations. + * + * @bus: the #PCIBus being updated. + * @ops: the #PCIIOMMUOps + * @opaque: passed to callbacks of the @ops structure. + */ +void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque); pcibus_t pci_bar_address(PCIDevice *d, int reg, uint8_t type, pcibus_t size); diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index 5653175957..2261312546 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -33,7 +33,7 @@ enum PCIBusFlags { struct PCIBus { BusState qbus; enum PCIBusFlags flags; - PCIIOMMUFunc iommu_fn; + const PCIIOMMUOps *iommu_ops; void *iommu_opaque; uint8_t devfn_min; uint32_t slot_reserved_mask; From 721da0396cfa0a4859cefb57e32cc79d19d80f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 26 Oct 2023 09:06:34 +0200 Subject: [PATCH 14/22] util/uuid: Add UUID_STR_LEN definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu_uuid_unparse() includes a trailing NUL when writing the uuid string and the buffer size should be UUID_FMT_LEN + 1 bytes. Add a define for this size and use it where required. Cc: Fam Zheng Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Juan Quintela Reviewed-by: "Denis V. Lunev" Signed-off-by: Cédric Le Goater --- block/parallels-ext.c | 2 +- block/vdi.c | 2 +- hw/core/qdev-properties-system.c | 2 +- hw/hyperv/vmbus.c | 4 ++-- include/qemu/uuid.h | 1 + migration/savevm.c | 4 ++-- tests/unit/test-uuid.c | 2 +- util/uuid.c | 2 +- 8 files changed, 10 insertions(+), 9 deletions(-) diff --git a/block/parallels-ext.c b/block/parallels-ext.c index 8a109f005a..4d8ecf5047 100644 --- a/block/parallels-ext.c +++ b/block/parallels-ext.c @@ -130,7 +130,7 @@ static BdrvDirtyBitmap *parallels_load_bitmap(BlockDriverState *bs, g_autofree uint64_t *l1_table = NULL; BdrvDirtyBitmap *bitmap; QemuUUID uuid; - char uuidstr[UUID_FMT_LEN + 1]; + char uuidstr[UUID_STR_LEN]; int i; if (data_size < sizeof(bf)) { diff --git a/block/vdi.c b/block/vdi.c index c647d72895..7cfd12b50d 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -239,7 +239,7 @@ static void vdi_header_to_le(VdiHeader *header) static void vdi_header_print(VdiHeader *header) { - char uuidstr[37]; + char uuidstr[UUID_STR_LEN]; QemuUUID uuid; logout("text %s", header->text); logout("signature 0x%08x\n", header->signature); diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b5ccafa29e..b46d16cd2c 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -1114,7 +1114,7 @@ static void get_uuid(Object *obj, Visitor *v, const char *name, void *opaque, { Property *prop = opaque; QemuUUID *uuid = object_field_prop_ptr(obj, prop); - char buffer[UUID_FMT_LEN + 1]; + char buffer[UUID_STR_LEN]; char *p = buffer; qemu_uuid_unparse(uuid, buffer); diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c index 271289f902..c64eaa5a46 100644 --- a/hw/hyperv/vmbus.c +++ b/hw/hyperv/vmbus.c @@ -2271,7 +2271,7 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp) VMBus *vmbus = VMBUS(qdev_get_parent_bus(dev)); BusChild *child; Error *err = NULL; - char idstr[UUID_FMT_LEN + 1]; + char idstr[UUID_STR_LEN]; assert(!qemu_uuid_is_null(&vdev->instanceid)); @@ -2467,7 +2467,7 @@ static char *vmbus_get_dev_path(DeviceState *dev) static char *vmbus_get_fw_dev_path(DeviceState *dev) { VMBusDevice *vdev = VMBUS_DEVICE(dev); - char uuid[UUID_FMT_LEN + 1]; + char uuid[UUID_STR_LEN]; qemu_uuid_unparse(&vdev->instanceid, uuid); return g_strdup_printf("%s@%s", qdev_fw_name(dev), uuid); diff --git a/include/qemu/uuid.h b/include/qemu/uuid.h index e24a1099e4..4e7afaf1d5 100644 --- a/include/qemu/uuid.h +++ b/include/qemu/uuid.h @@ -79,6 +79,7 @@ typedef struct { "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" #define UUID_FMT_LEN 36 +#define UUID_STR_LEN (UUID_FMT_LEN + 1) #define UUID_NONE "00000000-0000-0000-0000-000000000000" diff --git a/migration/savevm.c b/migration/savevm.c index bc98c2ea6f..eec5503a42 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -471,8 +471,8 @@ static bool vmstate_uuid_needed(void *opaque) static int vmstate_uuid_post_load(void *opaque, int version_id) { SaveState *state = opaque; - char uuid_src[UUID_FMT_LEN + 1]; - char uuid_dst[UUID_FMT_LEN + 1]; + char uuid_src[UUID_STR_LEN]; + char uuid_dst[UUID_STR_LEN]; if (!qemu_uuid_set) { /* diff --git a/tests/unit/test-uuid.c b/tests/unit/test-uuid.c index aedc125ae9..739b91583c 100644 --- a/tests/unit/test-uuid.c +++ b/tests/unit/test-uuid.c @@ -145,7 +145,7 @@ static void test_uuid_unparse(void) int i; for (i = 0; i < ARRAY_SIZE(uuid_test_data); i++) { - char out[37]; + char out[UUID_STR_LEN]; if (!uuid_test_data[i].check_unparse) { continue; diff --git a/util/uuid.c b/util/uuid.c index d71aa79e5e..234619dd5e 100644 --- a/util/uuid.c +++ b/util/uuid.c @@ -51,7 +51,7 @@ int qemu_uuid_is_equal(const QemuUUID *lhv, const QemuUUID *rhv) void qemu_uuid_unparse(const QemuUUID *uuid, char *out) { const unsigned char *uu = &uuid->data[0]; - snprintf(out, UUID_FMT_LEN + 1, UUID_FMT, + snprintf(out, UUID_STR_LEN, UUID_FMT, uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); } From f8d6f3b16c37bd516a026e92a31dade5d761d3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 26 Oct 2023 09:06:35 +0200 Subject: [PATCH 15/22] vfio/pci: Fix buffer overrun when writing the VF token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu_uuid_unparse() includes a trailing NUL when writing the uuid string and the buffer size should be UUID_FMT_LEN + 1 bytes. Use the recently added UUID_STR_LEN which defines the correct size. Fixes: CID 1522913 Fixes: 2dca1b37a760 ("vfio/pci: add support for VF token") Cc: Alex Williamson Reviewed-by: Alex Williamson Reviewed-by: Juan Quintela Reviewed-by: "Denis V. Lunev" Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b27011cee7..c62c02f7b6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3081,7 +3081,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) struct stat st; int i, ret; bool is_mdev; - char uuid[UUID_FMT_LEN]; + char uuid[UUID_STR_LEN]; char *name; if (!vbasedev->sysfsdev) { From 4ef9d97b1a37b8cfd152cc3ac5f9576e406868b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 26 Oct 2023 09:06:36 +0200 Subject: [PATCH 16/22] util/uuid: Remove UUID_FMT_LEN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dangerous and now unused. Cc: Fam Zheng Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: "Denis V. Lunev" Reviewed-by: Juan Quintela Signed-off-by: Cédric Le Goater --- include/qemu/uuid.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/qemu/uuid.h b/include/qemu/uuid.h index 4e7afaf1d5..356efe7b57 100644 --- a/include/qemu/uuid.h +++ b/include/qemu/uuid.h @@ -78,8 +78,7 @@ typedef struct { "%02hhx%02hhx-" \ "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" -#define UUID_FMT_LEN 36 -#define UUID_STR_LEN (UUID_FMT_LEN + 1) +#define UUID_STR_LEN (36 + 1) #define UUID_NONE "00000000-0000-0000-0000-000000000000" From 5fe51934b1cd94a75007dd456fecc2ff6ee622e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 25 Oct 2023 12:12:44 +0200 Subject: [PATCH 17/22] util/uuid: Define UUID_STR_LEN from UUID_NONE string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Fam Zheng Suggested-by: Philippe Mathieu-Daudé Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Cédric Le Goater --- include/qemu/uuid.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/qemu/uuid.h b/include/qemu/uuid.h index 356efe7b57..869f84af09 100644 --- a/include/qemu/uuid.h +++ b/include/qemu/uuid.h @@ -78,9 +78,10 @@ typedef struct { "%02hhx%02hhx-" \ "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" -#define UUID_STR_LEN (36 + 1) - #define UUID_NONE "00000000-0000-0000-0000-000000000000" +QEMU_BUILD_BUG_ON(sizeof(UUID_NONE) - 1 != 36); + +#define UUID_STR_LEN sizeof(UUID_NONE) void qemu_uuid_generate(QemuUUID *out); From 54876d25fe298e0761556253f1c07af6160d5c10 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 2 Nov 2023 15:12:22 +0800 Subject: [PATCH 18/22] vfio/container: Move IBM EEH related functions into spapr_pci_vfio.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With vfio_eeh_as_ok/vfio_eeh_as_op moved and made static, vfio.h becomes empty and is deleted. No functional changes intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Acked-by: Eric Farman Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- hw/ppc/spapr_pci_vfio.c | 100 +++++++++++++++++++++++++++++++++++++++- hw/vfio/ap.c | 1 - hw/vfio/ccw.c | 1 - hw/vfio/common.c | 1 - hw/vfio/container.c | 98 --------------------------------------- hw/vfio/helpers.c | 1 - include/hw/vfio/vfio.h | 7 --- 7 files changed, 99 insertions(+), 110 deletions(-) delete mode 100644 include/hw/vfio/vfio.h diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index 9016720547..f283f7e38d 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -18,14 +18,112 @@ */ #include "qemu/osdep.h" +#include #include #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "hw/pci/msix.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio.h" +#include "hw/vfio/vfio-common.h" #include "qemu/error-report.h" +/* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +static bool vfio_eeh_container_ok(VFIOContainer *container) +{ + /* + * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO + * implementation is broken if there are multiple groups in a + * container. The hardware works in units of Partitionable + * Endpoints (== IOMMU groups) and the EEH operations naively + * iterate across all groups in the container, without any logic + * to make sure the groups have their state synchronized. For + * certain operations (ENABLE) that might be ok, until an error + * occurs, but for others (GET_STATE) it's clearly broken. + */ + + /* + * XXX Once fixed kernels exist, test for them here + */ + + if (QLIST_EMPTY(&container->group_list)) { + return false; + } + + if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { + return false; + } + + return true; +} + +static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) +{ + struct vfio_eeh_pe_op pe_op = { + .argsz = sizeof(pe_op), + .op = op, + }; + int ret; + + if (!vfio_eeh_container_ok(container)) { + error_report("vfio/eeh: EEH_PE_OP 0x%x: " + "kernel requires a container with exactly one group", op); + return -EPERM; + } + + ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); + if (ret < 0) { + error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); + return -errno; + } + + return ret; +} + +static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) +{ + VFIOAddressSpace *space = vfio_get_address_space(as); + VFIOContainer *container = NULL; + + if (QLIST_EMPTY(&space->containers)) { + /* No containers to act on */ + goto out; + } + + container = QLIST_FIRST(&space->containers); + + if (QLIST_NEXT(container, next)) { + /* + * We don't yet have logic to synchronize EEH state across + * multiple containers + */ + container = NULL; + goto out; + } + +out: + vfio_put_address_space(space); + return container; +} + +static bool vfio_eeh_as_ok(AddressSpace *as) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + return (container != NULL) && vfio_eeh_container_ok(container); +} + +static int vfio_eeh_as_op(AddressSpace *as, uint32_t op) +{ + VFIOContainer *container = vfio_eeh_as_container(as); + + if (!container) { + return -ENODEV; + } + return vfio_eeh_container_op(container, op); +} + bool spapr_phb_eeh_available(SpaprPhbState *sphb) { return vfio_eeh_as_ok(&sphb->iommu_as); diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 5f257bffb9..bbf69ff55a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -14,7 +14,6 @@ #include #include #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6623ae237b..d857bb8d0f 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -20,7 +20,6 @@ #include #include "qapi/error.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 9c5c6433f2..e72055e752 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -26,7 +26,6 @@ #include #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/vfio/pci.h" #include "exec/address-spaces.h" #include "exec/memory.h" diff --git a/hw/vfio/container.c b/hw/vfio/container.c index fc88222377..83c0f05bba 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -26,7 +26,6 @@ #include #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" @@ -1011,103 +1010,6 @@ static void vfio_put_base_device(VFIODevice *vbasedev) close(vbasedev->fd); } -/* - * Interfaces for IBM EEH (Enhanced Error Handling) - */ -static bool vfio_eeh_container_ok(VFIOContainer *container) -{ - /* - * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO - * implementation is broken if there are multiple groups in a - * container. The hardware works in units of Partitionable - * Endpoints (== IOMMU groups) and the EEH operations naively - * iterate across all groups in the container, without any logic - * to make sure the groups have their state synchronized. For - * certain operations (ENABLE) that might be ok, until an error - * occurs, but for others (GET_STATE) it's clearly broken. - */ - - /* - * XXX Once fixed kernels exist, test for them here - */ - - if (QLIST_EMPTY(&container->group_list)) { - return false; - } - - if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) { - return false; - } - - return true; -} - -static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) -{ - struct vfio_eeh_pe_op pe_op = { - .argsz = sizeof(pe_op), - .op = op, - }; - int ret; - - if (!vfio_eeh_container_ok(container)) { - error_report("vfio/eeh: EEH_PE_OP 0x%x: " - "kernel requires a container with exactly one group", op); - return -EPERM; - } - - ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op); - if (ret < 0) { - error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op); - return -errno; - } - - return ret; -} - -static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) -{ - VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; - - if (QLIST_EMPTY(&space->containers)) { - /* No containers to act on */ - goto out; - } - - container = QLIST_FIRST(&space->containers); - - if (QLIST_NEXT(container, next)) { - /* - * We don't yet have logic to synchronize EEH state across - * multiple containers - */ - container = NULL; - goto out; - } - -out: - vfio_put_address_space(space); - return container; -} - -bool vfio_eeh_as_ok(AddressSpace *as) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - return (container != NULL) && vfio_eeh_container_ok(container); -} - -int vfio_eeh_as_op(AddressSpace *as, uint32_t op) -{ - VFIOContainer *container = vfio_eeh_as_container(as); - - if (!container) { - return -ENODEV; - } - return vfio_eeh_container_op(container, op); -} - static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) { char *tmp, group_path[PATH_MAX], *group_name; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 7e5da21b31..168847e7c5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -23,7 +23,6 @@ #include #include "hw/vfio/vfio-common.h" -#include "hw/vfio/vfio.h" #include "hw/hw.h" #include "trace.h" #include "qapi/error.h" diff --git a/include/hw/vfio/vfio.h b/include/hw/vfio/vfio.h deleted file mode 100644 index 86248f5436..0000000000 --- a/include/hw/vfio/vfio.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef HW_VFIO_H -#define HW_VFIO_H - -bool vfio_eeh_as_ok(AddressSpace *as); -int vfio_eeh_as_op(AddressSpace *as, uint32_t op); - -#endif From 521c8f4ebc5923576436228211f8947f306b9d82 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 2 Nov 2023 15:12:23 +0800 Subject: [PATCH 19/22] vfio/container: Move vfio_container_add/del_section_window into spapr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_container_add/del_section_window are spapr specific functions, so move them into spapr.c to make container.c cleaner. No functional changes intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 90 --------------------------------------------- hw/vfio/spapr.c | 90 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 90 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 83c0f05bba..7a3f005d1b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -20,9 +20,6 @@ #include "qemu/osdep.h" #include -#ifdef CONFIG_KVM -#include -#endif #include #include "hw/vfio/vfio-common.h" @@ -32,7 +29,6 @@ #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/range.h" -#include "sysemu/kvm.h" #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" @@ -204,92 +200,6 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) -{ - VFIOHostDMAWindow *hostwin; - hwaddr pgsize = 0; - int ret; - - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return 0; - } - - /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - section->offset_within_address_space, - int128_get64(section->size))) { - error_setg(errp, - "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" - "host DMA window [0x%"PRIx64",0x%"PRIx64"]", - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1, - hostwin->min_iova, hostwin->max_iova); - return -EINVAL; - } - } - - ret = vfio_spapr_create_window(container, section, &pgsize); - if (ret) { - error_setg_errno(errp, -ret, "Failed to create SPAPR window"); - return ret; - } - - vfio_host_win_add(container, section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1, pgsize); -#ifdef CONFIG_KVM - if (kvm_enabled()) { - VFIOGroup *group; - IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); - struct kvm_vfio_spapr_tce param; - struct kvm_device_attr attr = { - .group = KVM_DEV_VFIO_GROUP, - .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, - .addr = (uint64_t)(unsigned long)¶m, - }; - - if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, - ¶m.tablefd)) { - QLIST_FOREACH(group, &container->group_list, container_next) { - param.groupfd = group->fd; - if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { - error_setg_errno(errp, errno, - "vfio: failed GROUP_SET_SPAPR_TCE for " - "KVM VFIO device %d and group fd %d", - param.tablefd, param.groupfd); - return -errno; - } - trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); - } - } - } -#endif - return 0; -} - -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) -{ - if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { - return; - } - - vfio_spapr_remove_window(container, - section->offset_within_address_space); - if (vfio_host_win_del(container, - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(section->size) - 1) < 0) { - hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, - __func__, section->offset_within_address_space); - } -} - int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) { int ret; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 9ec1e95f6d..9a7517c042 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -11,6 +11,10 @@ #include "qemu/osdep.h" #include #include +#ifdef CONFIG_KVM +#include +#endif +#include "sysemu/kvm.h" #include "hw/vfio/vfio-common.h" #include "hw/hw.h" @@ -253,3 +257,89 @@ int vfio_spapr_remove_window(VFIOContainer *container, return 0; } + +int vfio_container_add_section_window(VFIOContainer *container, + MemoryRegionSection *section, + Error **errp) +{ + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + int ret; + + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return 0; + } + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + error_setg(errp, + "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" + "host DMA window [0x%"PRIx64",0x%"PRIx64"]", + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, + hostwin->min_iova, hostwin->max_iova); + return -EINVAL; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); + if (ret) { + error_setg_errno(errp, -ret, "Failed to create SPAPR window"); + return ret; + } + + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); +#ifdef CONFIG_KVM + if (kvm_enabled()) { + VFIOGroup *group; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + struct kvm_vfio_spapr_tce param; + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_GROUP, + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, + .addr = (uint64_t)(unsigned long)¶m, + }; + + if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, + ¶m.tablefd)) { + QLIST_FOREACH(group, &container->group_list, container_next) { + param.groupfd = group->fd; + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_setg_errno(errp, errno, + "vfio: failed GROUP_SET_SPAPR_TCE for " + "KVM VFIO device %d and group fd %d", + param.tablefd, param.groupfd); + return -errno; + } + trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); + } + } + } +#endif + return 0; +} + +void vfio_container_del_section_window(VFIOContainer *container, + MemoryRegionSection *section) +{ + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return; + } + + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } +} From 770c3b6e431e5160b81f889ccb76c07dcfeb5da5 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 2 Nov 2023 15:12:24 +0800 Subject: [PATCH 20/22] vfio/container: Move spapr specific init/deinit into spapr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move spapr specific init/deinit code into spapr.c and wrap them with vfio_spapr_container_init/deinit, this way footprint of spapr is further reduced, vfio_prereg_listener could also be made static. vfio_listener_release is unnecessary when prereg_listener is moved out, so have it removed. No functional changes intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 82 +++++------------------------------ hw/vfio/spapr.c | 81 +++++++++++++++++++++++++++++++++- include/hw/vfio/vfio-common.h | 4 +- 3 files changed, 95 insertions(+), 72 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 7a3f005d1b..204b244b11 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -264,14 +264,6 @@ int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, return ret; } -static void vfio_listener_release(VFIOContainer *container) -{ - memory_listener_unregister(&container->listener); - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); - } -} - static struct vfio_info_cap_header * vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) { @@ -612,69 +604,11 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_SPAPR_TCE_v2_IOMMU: case VFIO_SPAPR_TCE_IOMMU: { - struct vfio_iommu_spapr_tce_info info; - bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; - - /* - * The host kernel code implementing VFIO_IOMMU_DISABLE is called - * when container fd is closed so we do not call it explicitly - * in this file. - */ - if (!v2) { - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_setg_errno(errp, errno, "failed to enable container"); - ret = -errno; - goto enable_discards_exit; - } - } else { - container->prereg_listener = vfio_prereg_listener; - - memory_listener_register(&container->prereg_listener, - &address_space_memory); - if (container->error) { - memory_listener_unregister(&container->prereg_listener); - ret = -1; - error_propagate_prepend(errp, container->error, - "RAM memory listener initialization failed: "); - goto enable_discards_exit; - } - } - - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + ret = vfio_spapr_container_init(container, errp); if (ret) { - error_setg_errno(errp, errno, - "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); - ret = -errno; - if (v2) { - memory_listener_unregister(&container->prereg_listener); - } goto enable_discards_exit; } - - if (v2) { - container->pgsizes = info.ddw.pgsizes; - /* - * There is a default window in just created container. - * To make region_add/del simpler, we better remove this - * window now and let those iommu_listener callbacks - * create/remove them when needed. - */ - ret = vfio_spapr_remove_window(container, info.dma32_window_start); - if (ret) { - error_setg_errno(errp, -ret, - "failed to remove existing window"); - goto enable_discards_exit; - } - } else { - /* The default table uses 4K pages */ - container->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); - } + break; } } @@ -704,7 +638,11 @@ listener_release_exit: QLIST_REMOVE(group, container_next); QLIST_REMOVE(container, next); vfio_kvm_device_del_group(group); - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } enable_discards_exit: vfio_ram_block_discard_disable(container, false); @@ -734,7 +672,11 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. */ if (QLIST_EMPTY(&container->group_list)) { - vfio_listener_release(container); + memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + vfio_spapr_container_deinit(container); + } } if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 9a7517c042..00dbd7af11 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -15,6 +15,7 @@ #include #endif #include "sysemu/kvm.h" +#include "exec/address-spaces.h" #include "hw/vfio/vfio-common.h" #include "hw/hw.h" @@ -139,7 +140,7 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); } -const MemoryListener vfio_prereg_listener = { +static const MemoryListener vfio_prereg_listener = { .name = "vfio-pre-reg", .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, @@ -343,3 +344,81 @@ void vfio_container_del_section_window(VFIOContainer *container, __func__, section->offset_within_address_space); } } + +int vfio_spapr_container_init(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_spapr_tce_info info; + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; + + /* + * The host kernel code implementing VFIO_IOMMU_DISABLE is called + * when container fd is closed so we do not call it explicitly + * in this file. + */ + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_setg_errno(errp, errno, "failed to enable container"); + return -errno; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + ret = -1; + error_propagate_prepend(errp, container->error, + "RAM memory listener initialization failed: "); + goto listener_unregister_exit; + } + } + + info.argsz = sizeof(info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_setg_errno(errp, errno, + "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); + ret = -errno; + goto listener_unregister_exit; + } + + if (v2) { + container->pgsizes = info.ddw.pgsizes; + /* + * There is a default window in just created container. + * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + error_setg_errno(errp, -ret, + "failed to remove existing window"); + goto listener_unregister_exit; + } + } else { + /* The default table uses 4K pages */ + container->pgsizes = 0x1000; + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } + + return 0; + +listener_unregister_exit: + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } + return ret; +} + +void vfio_spapr_container_deinit(VFIOContainer *container) +{ + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } +} diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0c3d390e8b..ed5a8e4754 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -225,11 +225,14 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); +/* SPAPR specific */ int vfio_container_add_section_window(VFIOContainer *container, MemoryRegionSection *section, Error **errp); void vfio_container_del_section_window(VFIOContainer *container, MemoryRegionSection *section); +int vfio_spapr_container_init(VFIOContainer *container, Error **errp); +void vfio_spapr_container_deinit(VFIOContainer *container); void vfio_disable_irqindex(VFIODevice *vbasedev, int index); void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index); @@ -289,7 +292,6 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); #endif -extern const MemoryListener vfio_prereg_listener; int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, From a17879f0e2e82c5e85440eb1c3e8a3eeef469a3e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 2 Nov 2023 15:12:25 +0800 Subject: [PATCH 21/22] vfio/spapr: Make vfio_spapr_create/remove_window static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_spapr_create_window calls vfio_spapr_remove_window, With reoder of definition of the two, we can make vfio_spapr_create/remove_window static. No functional changes intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- hw/vfio/spapr.c | 48 +++++++++++++++++------------------ include/hw/vfio/vfio-common.h | 6 ----- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 00dbd7af11..4428990c28 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -146,9 +146,30 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; -int vfio_spapr_create_window(VFIOContainer *container, - MemoryRegionSection *section, - hwaddr *pgsize) +static int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} + +static int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) { int ret = 0; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); @@ -238,27 +259,6 @@ int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_spapr_remove_window(VFIOContainer *container, - hwaddr offset_within_address_space) -{ - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - .start_addr = offset_within_address_space, - }; - int ret; - - ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); - if (ret) { - error_report("Failed to remove window at %"PRIx64, - (uint64_t)remove.start_addr); - return -errno; - } - - trace_vfio_spapr_remove_window(offset_within_address_space); - - return 0; -} - int vfio_container_add_section_window(VFIOContainer *container, MemoryRegionSection *section, Error **errp) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ed5a8e4754..87848982bd 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -293,12 +293,6 @@ struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); #endif -int vfio_spapr_create_window(VFIOContainer *container, - MemoryRegionSection *section, - hwaddr *pgsize); -int vfio_spapr_remove_window(VFIOContainer *container, - hwaddr offset_within_address_space); - bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); From a2347c60a86a7c2a227ebab186a195d16e1e3901 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 2 Nov 2023 15:12:26 +0800 Subject: [PATCH 22/22] vfio/common: Move vfio_host_win_add/del into spapr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only spapr supports a customed host window list, other vfio driver assume 64bit host window. So remove the check in listener callback and move vfio_host_win_add/del into spapr.c and make it static. With the check removed, we still need to do the same check for VFIO_SPAPR_TCE_IOMMU which allows a single host window range [dma32_window_start, dma32_window_size). Move vfio_find_hostwin into spapr.c and do same check in vfio_container_add_section_window instead. When mapping a ram device section, if it's unaligned with hostwin->iova_pgsizes, this mapping is bypassed. With hostwin moved into spapr, we changed to check container->pgsizes. Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 70 +---------------------------- hw/vfio/container.c | 16 ------- hw/vfio/spapr.c | 83 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 5 --- 4 files changed, 85 insertions(+), 89 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e72055e752..e70fdf5e0c 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -245,44 +245,6 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return true; } -void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, - hwaddr max_iova, uint64_t iova_pgsizes) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (ranges_overlap(hostwin->min_iova, - hostwin->max_iova - hostwin->min_iova + 1, - min_iova, - max_iova - min_iova + 1)) { - hw_error("%s: Overlapped IOMMU are not enabled", __func__); - } - } - - hostwin = g_malloc0(sizeof(*hostwin)); - - hostwin->min_iova = min_iova; - hostwin->max_iova = max_iova; - hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); -} - -int vfio_host_win_del(VFIOContainer *container, - hwaddr min_iova, hwaddr max_iova) -{ - VFIOHostDMAWindow *hostwin; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - return 0; - } - } - - return -1; -} - static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -531,22 +493,6 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, - hwaddr iova, hwaddr end) -{ - VFIOHostDMAWindow *hostwin; - bool hostwin_found = false; - - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - - return hostwin_found ? hostwin : NULL; -} - static bool vfio_known_safe_misalignment(MemoryRegionSection *section) { MemoryRegion *mr = section->mr; @@ -625,7 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; - VFIOHostDMAWindow *hostwin; Error *err = NULL; if (!vfio_listener_valid_section(section, "region_add")) { @@ -647,13 +592,6 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } - hostwin = vfio_find_hostwin(container, iova, end); - if (!hostwin) { - error_setg(&err, "Container %p can't map guest IOVA region" - " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); - goto fail; - } - memory_region_ref(section->mr); if (memory_region_is_iommu(section->mr)) { @@ -734,7 +672,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -833,12 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - VFIOHostDMAWindow *hostwin; - hostwin = vfio_find_hostwin(container, iova, end); - assert(hostwin); /* or region_add() would have failed */ - - pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + pgmask = (1ULL << ctz64(container->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { vfio_unregister_ram_discard_listener(container, section); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 204b244b11..242010036a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -551,7 +551,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->dma_max_mappings = 0; container->iova_ranges = NULL; QLIST_INIT(&container->giommu_list); - QLIST_INIT(&container->hostwin_list); QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); @@ -591,14 +590,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_get_iommu_info_migration(container, info); g_free(info); - - /* - * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE - * information to get the actual window extent rather than assume - * a 64-bit IOVA address space. - */ - vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes); - break; } case VFIO_SPAPR_TCE_v2_IOMMU: @@ -687,7 +678,6 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; VFIOGuestIOMMU *giommu, *tmp; - VFIOHostDMAWindow *hostwin, *next; QLIST_REMOVE(container, next); @@ -698,12 +688,6 @@ static void vfio_disconnect_container(VFIOGroup *group) g_free(giommu); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, - next) { - QLIST_REMOVE(hostwin, hostwin_next); - g_free(hostwin); - } - trace_vfio_disconnect_container(container->fd); close(container->fd); vfio_free_container(container); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 4428990c28..83da2f7ec2 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -146,6 +146,60 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; +static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova, uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + +static int vfio_host_win_del(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + return 0; + } + } + + return -1; +} + +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, + hwaddr iova, hwaddr end) +{ + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + return hostwin_found ? hostwin : NULL; +} + static int vfio_spapr_remove_window(VFIOContainer *container, hwaddr offset_within_address_space) { @@ -267,6 +321,26 @@ int vfio_container_add_section_window(VFIOContainer *container, hwaddr pgsize = 0; int ret; + /* + * VFIO_SPAPR_TCE_IOMMU supports a single host window between + * [dma32_window_start, dma32_window_size), we need to ensure + * the section fall in this range. + */ + if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { + hwaddr iova, end; + + iova = section->offset_within_address_space; + end = iova + int128_get64(section->size) - 1; + + if (!vfio_find_hostwin(container, iova, end)) { + error_setg(errp, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, + iova, end); + return -EINVAL; + } + return 0; + } + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return 0; } @@ -351,6 +425,8 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; + QLIST_INIT(&container->hostwin_list); + /* * The host kernel code implementing VFIO_IOMMU_DISABLE is called * when container fd is closed so we do not call it explicitly @@ -418,7 +494,14 @@ listener_unregister_exit: void vfio_spapr_container_deinit(VFIOContainer *container) { + VFIOHostDMAWindow *hostwin, *next; + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { memory_listener_unregister(&container->prereg_listener); } + QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 87848982bd..a4a22accb9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -207,11 +207,6 @@ typedef struct { hwaddr pages; } VFIOBitmap; -void vfio_host_win_add(VFIOContainer *container, - hwaddr min_iova, hwaddr max_iova, - uint64_t iova_pgsizes); -int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, - hwaddr max_iova); VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container);