From 4292d5019345a65f14c85e8207c7059e3791773e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 4 Dec 2020 09:42:40 +0800 Subject: [PATCH 01/10] vfio: Fix vfio_listener_log_sync function name typo There is an obvious typo in the function name of the .log_sync() callback. Spell it correctly. Signed-off-by: Zenghui Yu Message-Id: <20201204014240.772-1-yuzenghui@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6ff1daa763..d360d6f2da 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1118,7 +1118,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, int128_get64(section->size), ram_addr); } -static void vfio_listerner_log_sync(MemoryListener *listener, +static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); @@ -1136,7 +1136,7 @@ static void vfio_listerner_log_sync(MemoryListener *listener, static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, - .log_sync = vfio_listerner_log_sync, + .log_sync = vfio_listener_log_sync, }; static void vfio_listener_release(VFIOContainer *container) From 4eda914cacc32c7c069bc57777dac0f338133e31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 5 Feb 2021 18:18:17 +0100 Subject: [PATCH 02/10] hw/vfio/pci-quirks: Replace the word 'blacklist' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the inclusive terminology from the "Conscious Language in your Open Source Projects" guidelines [*] and replace the word "blacklist" appropriately. [*] https://github.com/conscious-lang/conscious-lang-docs/blob/main/faq.md Reviewed-by: Alex Williamson Acked-by: Alex Williamson Reviewed-by: Daniel P. 
Berrangé Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20210205171817.2108907-9-philmd@redhat.com> Signed-off-by: Alex Williamson --- hw/vfio/pci-quirks.c | 14 +++++++------- hw/vfio/pci.c | 4 ++-- hw/vfio/pci.h | 2 +- hw/vfio/trace-events | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index c5c4c61d01..b90cf3d37c 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -44,19 +44,19 @@ static const struct { uint32_t vendor; uint32_t device; -} romblacklist[] = { +} rom_denylist[] = { { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ }; -bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev) +bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev) { int i; - for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) { - if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) { - trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name, - romblacklist[i].vendor, - romblacklist[i].device); + for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) { + if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) { + trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name, + rom_denylist[i].vendor, + rom_denylist[i].device); return true; } } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f74be78209..759a3b1abf 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -900,7 +900,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ - if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) { + if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) { warn_report("Device at %s is known to cause system instability" " issues during option rom execution", vdev->vbasedev.name); @@ -927,7 +927,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) return; } - if (vfio_blacklist_opt_rom(vdev)) { + if (vfio_opt_rom_in_denylist(vdev)) { if (dev->opts && qemu_opt_get(dev->opts, "rombar")) { 
warn_report("Device at %s is known to cause system instability" " issues during option rom execution", diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 1574ef983f..64777516d1 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -197,7 +197,7 @@ void vfio_pci_write_config(PCIDevice *pdev, uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); -bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev); +bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev); void vfio_vga_quirk_setup(VFIOPCIDevice *vdev); void vfio_vga_quirk_exit(VFIOPCIDevice *vdev); void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index c0e75f24b7..079f53acf2 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -49,7 +49,7 @@ vfio_pci_emulated_sub_vendor_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_sub_device_id(const char *name, uint16_t val) "%s 0x%04x" # pci-quirks.c -vfio_quirk_rom_blacklisted(const char *name, uint16_t vid, uint16_t did) "%s %04x:%04x" +vfio_quirk_rom_in_denylist(const char *name, uint16_t vid, uint16_t did) "%s %04x:%04x" vfio_quirk_generic_window_address_write(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 vfio_quirk_generic_window_data_read(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 vfio_quirk_generic_window_data_write(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 From 4e779bf1a55e8b951f1640e3ea46fc459066f64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 2 Feb 2021 16:56:11 +0100 Subject: [PATCH 03/10] MAINTAINERS: Cover docs/igd-assign.txt in VFIO section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20210202155611.998424-1-philmd@redhat.com> Signed-off-by: Alex Williamson --- MAINTAINERS 
| 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b6ab3d25a7..5b74df54dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1809,6 +1809,7 @@ M: Alex Williamson S: Supported F: hw/vfio/* F: include/hw/vfio/ +F: docs/igd-assign.txt vfio-ccw M: Cornelia Huck From 8dca037b484fc8caeb6d6689745bc7475ce27174 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Feb 2021 22:32:32 +0100 Subject: [PATCH 04/10] vfio: Do not register any IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier In an attempt to fix the smmu/virtio-iommu - vhost regression, commit 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support") broke virtio-iommu integration. This is due to the fact that VFIO registers IOMMU_NOTIFIER_ALL notifiers, which include IOMMU_NOTIFIER_DEVIOTLB_UNMAP, and the latter is now rejected by the virtio-iommu. As a consequence, the registration fails. VHOST behaves like a device with an ATC cache. The VFIO device does not support this scheme yet. Let's register only legacy MAP and UNMAP notifiers. 
Fixes: 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support") Signed-off-by: Eric Auger Message-Id: <20210209213233.40985-2-eric.auger@redhat.com> Acked-by: Jason Wang Acked-by: Alex Williamson Signed-off-by: Alex Williamson --- hw/vfio/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d360d6f2da..35a41fd052 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -787,7 +787,7 @@ static void vfio_listener_region_add(MemoryListener *listener, iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, MEMTXATTRS_UNSPECIFIED); iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, - IOMMU_NOTIFIER_ALL, + IOMMU_NOTIFIER_IOTLB_EVENTS, section->offset_within_region, int128_get64(llend), iommu_idx); From 1a8e22bd20c2586df0bc0fdce8d5a3b42fffb1ac Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 9 Feb 2021 22:32:33 +0100 Subject: [PATCH 05/10] spapr_iommu: Fix vhost integration regression Previous work on dev-iotlb message broke spapr_iommu/vhost integration as it did for SMMU and virtio-iommu. The spapr_iommu currently only sends IOMMU_NOTIFIER_UNMAP notifications. Since commit 958ec334bca3 ("vhost: Unbreak SMMU and virtio-iommu on dev-iotlb support"), VHOST first tries to register IOMMU_NOTIFIER_DEVIOTLB_UNMAP notifier and if it fails, falls back to legacy IOMMU_NOTIFIER_UNMAP. So spapr_iommu must fail on the IOMMU_NOTIFIER_DEVIOTLB_UNMAP registration. Reported-by: Peter Xu Fixes: b68ba1ca5767 ("memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP IOMMUTLBNotificationType") Signed-off-by: Eric Auger Message-Id: <20210209213233.40985-3-eric.auger@redhat.com> Acked-by: David Gibson Acked-by: Jason Wang Reviewed-by: Michael S. 
Tsirkin Reviewed-by: Greg Kurz Signed-off-by: Alex Williamson --- hw/ppc/spapr_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 30352df00e..24537ffcbd 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -212,6 +212,11 @@ static int spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, { struct SpaprTceTable *tbl = container_of(iommu, SpaprTceTable, iommu); + if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { + error_setg(errp, "spart_tce does not support dev-iotlb yet"); + return -EINVAL; + } + if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) { spapr_tce_set_need_vfio(tbl, true); } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) { From d329f5032e17f3ecc7f8c2c3c5f130ec671000d2 Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Wed, 10 Mar 2021 11:02:31 +0800 Subject: [PATCH 06/10] vfio: Move the saving of the config space to the right place in VFIO migration On ARM64 the VFIO SET_IRQS ioctl is dependent on the VM interrupt setup, if the restoring of the VFIO PCI device config space is before the VGIC, an error might occur in the kernel. So we move the saving of the config space to the non-iterable process, thus it will be called after the VGIC according to their priorities. As for the possible dependence of the device specific migration data on its config space, we can let the vendor driver include any config info it needs in its own data stream. 
Signed-off-by: Shenming Lu Reviewed-by: Kirti Wankhede Message-Id: <20210310030233.1133-2-lushenming@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 134bdccc4f..003786f3cd 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -575,11 +575,6 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) return ret; } - ret = vfio_save_device_config_state(f, opaque); - if (ret) { - return ret; - } - ret = vfio_update_pending(vbasedev); if (ret) { return ret; @@ -620,6 +615,19 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) return ret; } +static void vfio_save_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + int ret; + + ret = vfio_save_device_config_state(f, opaque); + if (ret) { + error_report("%s: Failed to save device config space", + vbasedev->name); + qemu_file_set_error(f, ret); + } +} + static int vfio_load_setup(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; @@ -670,11 +678,7 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) switch (data) { case VFIO_MIG_FLAG_DEV_CONFIG_STATE: { - ret = vfio_load_device_config_state(f, opaque); - if (ret) { - return ret; - } - break; + return vfio_load_device_config_state(f, opaque); } case VFIO_MIG_FLAG_DEV_SETUP_STATE: { @@ -720,6 +724,7 @@ static SaveVMHandlers savevm_vfio_handlers = { .save_live_pending = vfio_save_pending, .save_live_iterate = vfio_save_iterate, .save_live_complete_precopy = vfio_save_complete_precopy, + .save_state = vfio_save_state, .load_setup = vfio_load_setup, .load_cleanup = vfio_load_cleanup, .load_state = vfio_load_state, From 8ce1ff990eff6affbdd0492fe4fc95e113235e35 Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Wed, 10 Mar 2021 11:02:32 +0800 Subject: [PATCH 07/10] vfio: Set the priority of the VFIO VM state change handler explicitly In the VFIO 
VM state change handler when stopping the VM, the _RUNNING bit in device_state is cleared which makes the VFIO device stop, including no longer generating interrupts. Then we can save the pending states of all interrupts in the GIC VM state change handler (on ARM). So we have to set the priority of the VFIO VM state change handler explicitly (like virtio devices) to ensure it is called before the GIC's in saving. Signed-off-by: Shenming Lu Reviewed-by: Kirti Wankhede Reviewed-by: Cornelia Huck Message-Id: <20210310030233.1133-3-lushenming@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 003786f3cd..eafb778947 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -862,7 +862,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, vbasedev); - migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, + migration->vm_state = qdev_add_vm_change_state_handler(vbasedev->dev, + vfio_vmstate_change, vbasedev); migration->migration_state.notify = vfio_migration_state_notifier; add_migration_state_change_notifier(&migration->migration_state); From ecebe53fe99379243695e817450124d69e061e39 Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Wed, 10 Mar 2021 11:02:33 +0800 Subject: [PATCH 08/10] vfio: Avoid disabling and enabling vectors repeatedly in VFIO migration In VFIO migration resume phase and some guest startups, there are already unmasked vectors in the vector table when calling vfio_msix_enable(). So in order to avoid inefficiently disabling and enabling vectors repeatedly, let's allocate all needed vectors first and then enable these unmasked vectors one by one without disabling. 
Signed-off-by: Shenming Lu Message-Id: <20210310030233.1133-4-lushenming@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 759a3b1abf..5c65aa0a98 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -569,6 +569,9 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) static void vfio_msix_enable(VFIOPCIDevice *vdev) { + PCIDevice *pdev = &vdev->pdev; + unsigned int nr, max_vec = 0; + vfio_disable_interrupts(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries); @@ -587,11 +590,22 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * triggering to userspace, then immediately release the vector, leaving * the physical device with no vectors enabled, but MSI-X enabled, just * like the guest view. + * If there are already unmasked vectors (in migration resume phase and + * some guest startups) which will be enabled soon, we can allocate all + * of them here to avoid inefficiently disabling and enabling vectors + * repeatedly later. 
*/ - vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL); - vfio_msix_vector_release(&vdev->pdev, 0); + if (!pdev->msix_function_masked) { + for (nr = 0; nr < msix_nr_vectors_allocated(pdev); nr++) { + if (!msix_is_masked(pdev, nr)) { + max_vec = nr; + } + } + } + vfio_msix_vector_do_use(pdev, max_vec, NULL, NULL); + vfio_msix_vector_release(pdev, max_vec); - if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } From 1eb7f642750c1a1499423e00f408820c6d37b129 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Thu, 4 Mar 2021 21:34:46 +0800 Subject: [PATCH 09/10] vfio: Support host translation granule size The cpu_physical_memory_set_dirty_lebitmap() can quickly deal with the dirty pages of memory by bitmap-traveling, regardless of whether the bitmap is aligned correctly or not. cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of host page size. So it'd be better to set bitmap_pgsize to host page size to support more translation granule sizes. [aw: The Fixes commit below introduced code to restrict migration support to configurations where the target page size intersects the host dirty page support. For example, a 4K guest on a 4K host. Due to the above flexibility in bitmap handling, this restriction unnecessarily prevents mixed target/host page sizes that could otherwise be supported. Use host page size for dirty bitmap.] 
Fixes: 87ea529c502 ("vfio: Get migration capability flags for container") Signed-off-by: Kunkun Jiang Message-Id: <20210304133446.1521-1-jiangkunkun@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/common.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 35a41fd052..ad08dfd729 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -378,7 +378,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, { struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; - uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; + uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size; int ret; unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); @@ -390,12 +390,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap = (struct vfio_bitmap *)&unmap->data; /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to - * TARGET_PAGE_SIZE. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize + * to qemu_real_host_page_size. 
*/ - bitmap->pgsize = TARGET_PAGE_SIZE; + bitmap->pgsize = qemu_real_host_page_size; bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; @@ -674,16 +674,17 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != - (section->offset_within_region & ~TARGET_PAGE_MASK))) { + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask) != + (section->offset_within_region & ~qemu_real_host_page_mask))) { error_report("%s received unaligned region", __func__); return; } - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); if (int128_ge(int128_make64(iova), llend)) { return; @@ -892,8 +893,9 @@ static void vfio_listener_region_del(MemoryListener *listener, return; } - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != - (section->offset_within_region & ~TARGET_PAGE_MASK))) { + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask) != + (section->offset_within_region & ~qemu_real_host_page_mask))) { error_report("%s received unaligned region", __func__); return; } @@ -921,10 +923,10 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); if (int128_ge(int128_make64(iova), llend)) { return; @@ 
-1004,13 +1006,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, range->size = size; /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to - * TARGET_PAGE_SIZE. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize + * to qemu_real_host_page_size. */ - range->bitmap.pgsize = TARGET_PAGE_SIZE; + range->bitmap.pgsize = qemu_real_host_page_size; - pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; + pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size; range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; range->bitmap.data = g_try_malloc0(range->bitmap.size); @@ -1114,8 +1116,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, section->offset_within_region; return vfio_get_dirty_bitmap(container, - TARGET_PAGE_ALIGN(section->offset_within_address_space), - int128_get64(section->size), ram_addr); + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); } static void vfio_listener_log_sync(MemoryListener *listener, @@ -1655,10 +1657,10 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, header); /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. 
*/ - if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) { container->dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; From 758b96b61d5cbc19204f340012d5a325f0a2105b Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 9 Mar 2021 11:19:13 +0800 Subject: [PATCH 10/10] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener For now the switch of vfio dirty page tracking is integrated into @vfio_save_handler. The reason is that some PCI vendor drivers may start to track dirty pages based on the _SAVING state of the device, so if dirty tracking is started before setting the device state, vfio will report full-dirty to QEMU. However, the dirty bitmaps of all ramblocks are fully set when setting up ram saving, so it doesn't matter whether the device is in the _SAVING state when starting vfio dirty tracking. Moreover, this logic causes some problems [1]. The object of dirty tracking is guest memory, but the object of @vfio_save_handler is device state, which produces unnecessary coupling and conflicts: 1. Coupling: Their saving granule is different (perVM vs perDevice). vfio will enable dirty_page_tracking for each device, when actually once is enough. 2. Conflicts: The ram_save_setup() traverses all memory_listeners to execute their log_start() and log_sync() hooks to get the first round dirty bitmap, which is used by the bulk stage of ram saving. However, as vfio dirty tracking is not yet started, it can't get the dirty bitmap from vfio. Then we give up the chance to handle vfio dirty pages at the bulk stage. Moving the switch of vfio dirty_page_tracking into vfio_memory_listener solves the above problems. Besides, do not require devices to be in the SAVING state for vfio_sync_dirty_bitmap(). 
[1] https://www.spinics.net/lists/kvm/msg229967.html Reported-by: Zenghui Yu Signed-off-by: Keqian Zhu Suggested-by: Paolo Bonzini Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com> Signed-off-by: Alex Williamson --- hw/vfio/common.c | 49 ++++++++++++++++++++++++++++++++++++--------- hw/vfio/migration.c | 35 -------------------------------- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ad08dfd729..ae5654fcdb 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -311,7 +311,7 @@ bool vfio_mig_active(void) return true; } -static bool vfio_devices_all_saving(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; VFIODevice *vbasedev; @@ -329,13 +329,8 @@ static bool vfio_devices_all_saving(VFIOContainer *container) return false; } - if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { - if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) - && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { - return false; - } - continue; - } else { + if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) + && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { return false; } } @@ -989,6 +984,40 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +{ + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + error_report("Failed to set dirty tracking flag 0x%x errno: %d", + dirty.flags, errno); + } +} + +static void vfio_listener_log_global_start(MemoryListener *listener) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + 
vfio_set_dirty_page_tracking(container, true); +} + +static void vfio_listener_log_global_stop(MemoryListener *listener) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + vfio_set_dirty_page_tracking(container, false); +} + static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { @@ -1130,7 +1159,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, return; } - if (vfio_devices_all_saving(container)) { + if (vfio_devices_all_dirty_tracking(container)) { vfio_sync_dirty_bitmap(container, section); } } @@ -1138,6 +1167,8 @@ static void vfio_listener_log_sync(MemoryListener *listener, static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, .log_sync = vfio_listener_log_sync, }; diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index eafb778947..384576cfc0 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } -static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) -{ - int ret; - VFIOMigration *migration = vbasedev->migration; - VFIOContainer *container = vbasedev->group->container; - struct vfio_iommu_type1_dirty_bitmap dirty = { - .argsz = sizeof(dirty), - }; - - if (start) { - if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; - } else { - return -EINVAL; - } - } else { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; - } - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); - if (ret) { - error_report("Failed to set dirty tracking flag 0x%x errno: %d", - dirty.flags, errno); - return -errno; - } - return ret; -} - static void vfio_migration_cleanup(VFIODevice 
*vbasedev) { VFIOMigration *migration = vbasedev->migration; - vfio_set_dirty_page_tracking(vbasedev, false); - if (migration->region.mmaps) { vfio_region_unmap(&migration->region); } @@ -469,11 +439,6 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) return ret; } - ret = vfio_set_dirty_page_tracking(vbasedev, true); - if (ret) { - return ret; - } - qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ret = qemu_file_get_error(f);