From 3b5948f808e3b99aedfa0aff45cffbe8b7ec07ed Mon Sep 17 00:00:00 2001
From: Avihai Horon
Date: Sun, 20 Oct 2024 16:01:06 +0300
Subject: [PATCH 1/4] vfio/migration: Report only stop-copy size in
 vfio_state_pending_exact()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vfio_state_pending_exact() is used to report to the migration core how
much device data is left for the device migration. Currently, the sum of
the pre-copy and stop-copy sizes of the VFIO device is reported.

The pre-copy size is obtained via the VFIO_MIG_GET_PRECOPY_INFO ioctl,
which returns the amount of device data available to be transferred
while the device is in the PRE_COPY states.

The stop-copy size is obtained via the VFIO_DEVICE_FEATURE_MIG_DATA_SIZE
ioctl, which returns the total amount of device data left to be
transferred in order to complete the device migration.

According to the above, the current implementation is wrong -- it
reports extra, overlapping data because the pre-copy size is already
contained in the stop-copy size. Fix it by reporting only the stop-copy
size.

Fixes: eda7362af959 ("vfio/migration: Add VFIO migration pre-copy support")
Signed-off-by: Avihai Horon
Reviewed-by: Cédric Le Goater
---
 hw/vfio/migration.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 17199b73ae..992dc3b102 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -576,9 +576,6 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
 
     if (vfio_device_state_is_precopy(vbasedev)) {
         vfio_query_precopy_size(migration);
-
-        *must_precopy +=
-            migration->precopy_init_size + migration->precopy_dirty_size;
     }
 
     trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
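For reference, the stop-copy size described above comes from the
VFIO_DEVICE_FEATURE_MIG_DATA_SIZE feature query defined in the Linux VFIO
UAPI. Below is a minimal, standalone sketch of such a query (the helper name
is illustrative and not part of QEMU; QEMU performs the equivalent in its own
vfio_query_stop_copy_size() helper). Because the returned total already
covers any data transferable during PRE_COPY, adding the pre-copy estimate on
top of it double-counts, which is what the patch above removes.

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Illustrative helper: query the total amount of device data still to be
 * transferred to complete migration (the "stop-copy" size).
 */
static int query_stop_copy_size(int device_fd, uint64_t *stop_copy_size)
{
    /* Feature header immediately followed by the feature payload. */
    uint64_t buf[(sizeof(struct vfio_device_feature) +
                  sizeof(struct vfio_device_feature_mig_data_size) +
                  sizeof(uint64_t) - 1) / sizeof(uint64_t)] = { 0 };
    struct vfio_device_feature *feature = (void *)buf;
    struct vfio_device_feature_mig_data_size *mig_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    /* Total device data still to be transferred to finish migration. */
    *stop_copy_size = mig_size->stop_copy_length;
    return 0;
}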
From fa4e20defe239e42af0a1b5c030dec114f799f56 Mon Sep 17 00:00:00 2001
From: Avihai Horon
Date: Sun, 20 Oct 2024 16:01:08 +0300
Subject: [PATCH 2/4] vfio/migration: Change trace formats from hex to decimal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Data sizes in VFIO migration trace events are printed in hex format,
while in migration core trace events they are printed in decimal format.
This inconsistency makes the output harder to read when both types of
trace events are used together. Hence, change the data size print format
to decimal in the VFIO migration trace events.

Signed-off-by: Avihai Horon
Reviewed-by: Cédric Le Goater
---
 hw/vfio/trace-events | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index c475c273fd..29789e8d27 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -151,7 +151,7 @@ vfio_display_edid_write_error(void) ""
 vfio_load_cleanup(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
-vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d"
+vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
 vfio_migration_realize(const char *name) " (%s)"
 vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
 vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
@@ -160,10 +160,10 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_cleanup(const char *name) " (%s)"
 vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
 vfio_save_device_config_state(const char *name) " (%s)"
-vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
-vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
-vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
-vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
+vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64
+vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size %"PRIu64
+vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy %"PRIu64" postcopy %"PRIu64" precopy initial size %"PRIu64" precopy dirty size %"PRIu64
+vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy %"PRIu64" postcopy %"PRIu64" stopcopy size %"PRIu64" precopy initial size %"PRIu64" precopy dirty size %"PRIu64
 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
 vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
 
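The precopy initial and dirty sizes printed by the vfio_save_iterate and
vfio_state_pending_* trace events above are the values patch 1 describes as
coming from the VFIO_MIG_GET_PRECOPY_INFO ioctl, issued on the migration data
fd while the device is in a PRE_COPY state. A minimal sketch of that query,
using only the Linux VFIO UAPI (the helper name is illustrative; QEMU's
vfio_query_precopy_size() issues the same ioctl internally):

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Illustrative helper: read the pre-copy estimates for a device in the
 * PRE_COPY state.  'data_fd' is the migration data file descriptor
 * returned when the device entered PRE_COPY.
 */
static int query_precopy_sizes(int data_fd, uint64_t *init_size,
                               uint64_t *dirty_size)
{
    struct vfio_precopy_info info = {
        .argsz = sizeof(info),
    };

    if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info)) {
        return -errno;
    }

    *init_size = info.initial_bytes;  /* initial device state still to read */
    *dirty_size = info.dirty_bytes;   /* data dirtied since it was last read */
    return 0;
}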
From 49915c0d2c9868e6f25e52e4d839943611b69e98 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Tue, 22 Oct 2024 14:08:28 -0600
Subject: [PATCH 3/4] vfio/helpers: Refactor vfio_region_mmap() error handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move error handling code to the end of the function so that it can more
easily be shared by new mmap failure conditions. No functional change
intended.

Signed-off-by: Alex Williamson
Reviewed-by: Peter Xu
Reviewed-by: Cédric Le Goater
---
 hw/vfio/helpers.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index ea15c79db0..b9e606e364 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -395,7 +395,7 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
 
 int vfio_region_mmap(VFIORegion *region)
 {
-    int i, prot = 0;
+    int i, ret, prot = 0;
     char *name;
 
     if (!region->mem) {
@@ -411,22 +411,8 @@ int vfio_region_mmap(VFIORegion *region)
                                      region->fd_offset + region->mmaps[i].offset);
 
         if (region->mmaps[i].mmap == MAP_FAILED) {
-            int ret = -errno;
-
-            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
-                                         region->fd_offset +
-                                         region->mmaps[i].offset,
-                                         region->fd_offset +
-                                         region->mmaps[i].offset +
-                                         region->mmaps[i].size - 1, ret);
-
-            region->mmaps[i].mmap = NULL;
-
-            for (i--; i >= 0; i--) {
-                vfio_subregion_unmap(region, i);
-            }
-
-            return ret;
+            ret = -errno;
+            goto no_mmap;
         }
 
         name = g_strdup_printf("%s mmaps[%d]",
@@ -446,6 +432,20 @@ int vfio_region_mmap(VFIORegion *region)
     }
 
     return 0;
+
+no_mmap:
+    trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
+                                 region->fd_offset + region->mmaps[i].offset,
+                                 region->fd_offset + region->mmaps[i].offset +
+                                 region->mmaps[i].size - 1, ret);
+
+    region->mmaps[i].mmap = NULL;
+
+    for (i--; i >= 0; i--) {
+        vfio_subregion_unmap(region, i);
+    }
+
+    return ret;
 }
 
 void vfio_region_unmap(VFIORegion *region)
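The next patch aligns the VFIO mmaps so the kernel can install PMD/PUD-level
pfnmap entries. As a standalone illustration of the trick it applies inside
vfio_region_mmap(): reserve an oversized anonymous PROT_NONE mapping, trim
the slack at both ends, then place the real mapping at the aligned address
with MAP_FIXED. This is a simplified sketch under the same assumptions as the
patch (region size is non-zero, alignment capped at 1 GiB); the helper name
is hypothetical and not part of QEMU.

#include <stdint.h>
#include <sys/mman.h>

#define GiB (1024UL * 1024UL * 1024UL)

/*
 * Map 'size' bytes of 'fd' at 'offset', aligned to the power-of-two size
 * of the region, capped at 1 GiB.  Returns MAP_FAILED on error.
 */
static void *mmap_aligned(int fd, off_t offset, size_t size, int prot)
{
    /* Lowest set bit of 'size' (assumes size != 0), capped at 1 GiB. */
    size_t align = size & -size;
    if (align > 1 * GiB) {
        align = 1 * GiB;
    }

    /* Reserve an oversized range so an aligned start must exist inside it. */
    void *base = mmap(NULL, size + align, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) {
        return MAP_FAILED;
    }

    /* Round up to the alignment and give back the slack on both sides. */
    uintptr_t aligned = ((uintptr_t)base + align - 1) & ~((uintptr_t)align - 1);
    void *start = (void *)aligned;
    munmap(base, (char *)start - (char *)base);
    munmap((char *)start + size, align - ((char *)start - (char *)base));

    /* Replace the reservation with the real mapping at the aligned address. */
    return mmap(start, size, prot, MAP_SHARED | MAP_FIXED, fd, offset);
}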
From 00b519c0bca0e933ed22e2e6f8bca6b23f41f950 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Tue, 22 Oct 2024 14:08:29 -0600
Subject: [PATCH 4/4] vfio/helpers: Align mmaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thanks to work by Peter Xu, support is introduced in Linux v6.12 to
allow pfnmap insertions at PMD and PUD levels of the page table. This
means that, provided a properly aligned mmap, the vfio driver is able
to map MMIO at significantly larger intervals than PAGE_SIZE. For
example, on x86_64 (currently the only architecture supporting huge
pfnmaps at the PUD level), rather than 4KiB mappings, we can map device
MMIO using 2MiB and even 1GiB page table entries.

Typically mmap will already provide PMD-aligned mappings, so devices
with moderately sized MMIO ranges, even GPUs with standard 256MiB BARs,
will already take advantage of this support. However, in order to
better support devices exposing multi-GiB MMIO, such as 3D accelerators
or GPUs with resizable BARs enabled, we need to manually align the mmap.

There doesn't seem to be a way for userspace to easily learn about PMD
and PUD mapping level sizes, therefore this takes the simple approach
of aligning the mapping to the power-of-two size of the region, up to
1GiB, which is currently the maximum alignment we care about.

Cc: Peter Xu
Signed-off-by: Alex Williamson
Reviewed-by: Peter Xu
Reviewed-by: Cédric Le Goater
---
 hw/vfio/helpers.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index b9e606e364..913796f437 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -27,6 +27,7 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/units.h"
 #include "monitor/monitor.h"
 
 /*
@@ -406,8 +407,35 @@ int vfio_region_mmap(VFIORegion *region)
     prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
 
     for (i = 0; i < region->nr_mmaps; i++) {
-        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
-                                     MAP_SHARED, region->vbasedev->fd,
+        size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
+        void *map_base, *map_align;
+
+        /*
+         * Align the mmap for more efficient mapping in the kernel.  Ideally
+         * we'd know the PMD and PUD mapping sizes to use as discrete alignment
+         * intervals, but we don't.  As of Linux v6.12, the largest PUD size
+         * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
+         * on x86_64).  Align by power-of-two size, capped at 1GiB.
+         *
+         * NB. qemu_memalign() and friends actually allocate memory, whereas
+         * the region size here can exceed host memory, therefore we manually
+         * create an oversized anonymous mapping and clean it up for alignment.
+         */
+        map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (map_base == MAP_FAILED) {
+            ret = -errno;
+            goto no_mmap;
+        }
+
+        map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
+        munmap(map_base, map_align - map_base);
+        munmap(map_align + region->mmaps[i].size,
+               align - (map_align - map_base));
+
+        region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
+                                     MAP_SHARED | MAP_FIXED,
+                                     region->vbasedev->fd,
                                      region->fd_offset + region->mmaps[i].offset);
 
         if (region->mmaps[i].mmap == MAP_FAILED) {