From 4e1871c450a14e38b09d4e312922eefd475c1c64 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Mon, 4 Mar 2024 12:53:37 +0200 Subject: [PATCH 01/34] migration: Don't serialize devices in qemu_savevm_state_iterate() Commit 90697be8896c ("live migration: Serialize vmstate saving in stage 2") introduced device serialization in qemu_savevm_state_iterate(). The rationale behind it was to first complete migration of slower changing block devices and only then migrate the RAM, to avoid sending fast changing RAM pages over and over. This commit was added a long time ago, and while it was useful back then, it is not the case anymore: 1. Block migration is deprecated, see commit 66db46ca83b8 ("migration: Deprecate block migration"). 2. Today there are other iterative devices besides RAM and block, such as VFIO, which are registered for migration after RAM. With current serialization behavior, a fast changing device can block other devices from sending their data, which may prevent migration from converging in some cases. The issue described in item 2 was observed in several VFIO migration scenarios with switchover-ack capability enabled, where some workload on the VM prevented RAM from ever reaching a hard zero, thus blocking VFIO initial pre-copy data from being sent. Hence, destination could not ack switchover and migration could not converge. Fix that by not serializing iterative devices in qemu_savevm_state_iterate(). Note that this still doesn't fully prevent device starvation. As correctly pointed out by Peter [1], a fast changing device might constantly consume all allocated bandwidth and block the following devices. However, this scenario is more likely to happen only if max-bandwidth is low. [1] https://lore.kernel.org/qemu-devel/Zd6iw9dBhW6wKNxx@x1n/ Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240304105339.20713-2-avihaih@nvidia.com Signed-off-by: Peter Xu --- migration/savevm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index dc1fb9c0d3..e84b26e1c8 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1390,7 +1390,8 @@ int qemu_savevm_state_resume_prepare(MigrationState *s) int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) { SaveStateEntry *se; - int ret = 1; + bool all_finished = true; + int ret; trace_savevm_state_iterate(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { @@ -1431,16 +1432,12 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) "%d(%s): %d", se->section_id, se->idstr, ret); qemu_file_set_error(f, ret); - } - if (ret <= 0) { - /* Do not proceed to the next vmstate before this one reported - completion of the current stage. This serializes the migration - and reduces the probability that a faster changing state is - synchronized over and over again. */ - break; + return ret; + } else if (!ret) { + all_finished = false; } } - return ret; + return all_finished; } static bool should_send_vmdesc(void) From 3f6ed59ec400a58646e766a323f7f8047f044b98 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Mon, 4 Mar 2024 12:53:38 +0200 Subject: [PATCH 02/34] vfio/migration: Refactor vfio_save_state() return value Currently, vfio_save_state() returns 1 regardless of whether there is more data to send or not. This was done to prevent a fast changing VFIO device from potentially blocking other devices from sending their data, as qemu_savevm_state_iterate() serialized devices. Now that qemu_savevm_state_iterate() no longer serializes devices, there is no need for that. Refactor vfio_save_state() to return 0 if more data is available and 1 if no more data is available. Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240304105339.20713-3-avihaih@nvidia.com Signed-off-by: Peter Xu --- hw/vfio/migration.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 50140eda87..0af783a589 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -529,11 +529,7 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size, migration->precopy_dirty_size); - /* - * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero. - * Return 1 so following handlers will not be potentially blocked. - */ - return 1; + return !migration->precopy_init_size && !migration->precopy_dirty_size; } static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) From ff64e0ba817d79968bffa88d205397abf05cc6e8 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Mon, 4 Mar 2024 12:53:39 +0200 Subject: [PATCH 03/34] vfio/migration: Add a note about migration rate limiting VFIO migration buffer size is currently limited to 1MB. Therefore, there is no need to check if migration rate exceeded, as in the worst case it will exceed by only 1MB. However, if the buffer size is later changed to a bigger value, vfio_save_iterate() should enforce migration rate (similar to migration RAM code). Add a note about this in vfio_save_iterate() to serve as a reminder. Suggested-by: Peter Xu Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240304105339.20713-4-avihaih@nvidia.com Signed-off-by: Peter Xu --- hw/vfio/migration.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 0af783a589..f82dcabc49 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -505,6 +505,12 @@ static bool vfio_is_active_iterate(void *opaque) return vfio_device_state_is_precopy(vbasedev); } +/* + * Note about migration rate limiting: VFIO migration buffer size is currently + * limited to 1MB, so there is no need to check if migration rate exceeded (as + * in the worst case it will exceed by 1MB). However, if the buffer size is + * later changed to a bigger value, migration rate should be enforced here. + */ static int vfio_save_iterate(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; From 12ab1e4fe84aafac37e006673f0d01f716a9a058 Mon Sep 17 00:00:00 2001 From: Maksim Davydov Date: Mon, 4 Mar 2024 17:42:03 +0300 Subject: [PATCH 04/34] migration/ram: add additional check If a migration stream is broken, the address and flag reading can return zero. Thus, an irrelevant flag error will be returned instead of EIO. It can be fixed by additional check after the reading. Signed-off-by: Maksim Davydov Link: https://lore.kernel.org/r/20240304144203.158477-1-davydov-max@yandex-team.ru Signed-off-by: Peter Xu --- migration/ram.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 003c28e133..2cd936d9ce 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4214,6 +4214,12 @@ static int ram_load_precopy(QEMUFile *f) i++; addr = qemu_get_be64(f); + ret = qemu_file_get_error(f); + if (ret) { + error_report("Getting RAM address failed"); + break; + } + flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK; From e8c44363fbf25c2601fa2fbf3ec0d6d37c5f1a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 4 Mar 2024 13:28:24 +0100 Subject: [PATCH 05/34] migration: Report error when shutdown fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help detect issues regarding I/O channels usage. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Peter Xu Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240304122844.1888308-7-clg@redhat.com Signed-off-by: Peter Xu --- migration/qemu-file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index b10c882629..a10882d47f 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -63,6 +63,8 @@ struct QEMUFile { */ int qemu_file_shutdown(QEMUFile *f) { + Error *err = NULL; + /* * We must set qemufile error before the real shutdown(), otherwise * there can be a race window where we thought IO all went though @@ -91,7 +93,8 @@ int qemu_file_shutdown(QEMUFile *f) return -ENOSYS; } - if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL) < 0) { + if (qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, &err) < 0) { + error_report_err(err); return -EIO; } From f61efdee1ef96445af7db832a0170d137fbdb31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 4 Mar 2024 13:28:25 +0100 Subject: [PATCH 06/34] migration: Remove SaveStateHandler and LoadStateHandler typedefs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They are only used once. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240304122844.1888308-8-clg@redhat.com Signed-off-by: Peter Xu --- include/migration/register.h | 4 ++-- include/qemu/typedefs.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/migration/register.h b/include/migration/register.h index 9ab1f79512..2e6a7d766e 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -18,7 +18,7 @@ typedef struct SaveVMHandlers { /* This runs inside the BQL. */ - SaveStateHandler *save_state; + void (*save_state)(QEMUFile *f, void *opaque); /* * save_prepare is called early, even before migration starts, and can be @@ -71,7 +71,7 @@ typedef struct SaveVMHandlers { /* This calculate the exact remaining data to transfer */ void (*state_pending_exact)(void *opaque, uint64_t *must_precopy, uint64_t *can_postcopy); - LoadStateHandler *load_state; + int (*load_state)(QEMUFile *f, void *opaque, int version_id); int (*load_setup)(QEMUFile *f, void *opaque); int (*load_cleanup)(void *opaque); /* Called when postcopy migration wants to resume from failure */ diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index a028dba4d0..50c277cf0b 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -151,8 +151,6 @@ typedef struct IRQState *qemu_irq; /* * Function types */ -typedef void SaveStateHandler(QEMUFile *f, void *opaque); -typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); typedef void (*qemu_irq_handler)(void *opaque, int n, int level); #endif /* QEMU_TYPEDEFS_H */ From ee8bb867ec48085404e6282e8f53f8446261cb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 4 Mar 2024 13:28:26 +0100 Subject: [PATCH 07/34] migration: Add documentation for SaveVMHandlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SaveVMHandlers structure is still in use for complex subsystems and devices. Document the handlers since we are going to modify a few later. Reviewed-by: Peter Xu Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240304122844.1888308-9-clg@redhat.com Signed-off-by: Peter Xu --- include/migration/register.h | 259 +++++++++++++++++++++++++++++++---- 1 file changed, 235 insertions(+), 24 deletions(-) diff --git a/include/migration/register.h b/include/migration/register.h index 2e6a7d766e..d7b70a8be6 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -16,30 +16,130 @@ #include "hw/vmstate-if.h" +/** + * struct SaveVMHandlers: handler structure to finely control + * migration of complex subsystems and devices, such as RAM, block and + * VFIO. + */ typedef struct SaveVMHandlers { - /* This runs inside the BQL. */ + + /* The following handlers run inside the BQL. */ + + /** + * @save_state + * + * Saves state section on the source using the latest state format + * version. + * + * Legacy method. Should be deprecated when all users are ported + * to VMStateDescription. + * + * @f: QEMUFile where to send the data + * @opaque: data pointer passed to register_savevm_live() + */ void (*save_state)(QEMUFile *f, void *opaque); - /* - * save_prepare is called early, even before migration starts, and can be - * used to perform early checks. + /** + * @save_prepare + * + * Called early, even before migration starts, and can be used to + * perform early checks. + * + * @opaque: data pointer passed to register_savevm_live() + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns zero to indicate success and negative for error */ int (*save_prepare)(void *opaque, Error **errp); + + /** + * @save_setup + * + * Initializes the data structures on the source and transmits + * first section containing information on the device + * + * @f: QEMUFile where to send the data + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*save_setup)(QEMUFile *f, void *opaque); + + /** + * @save_cleanup + * + * Uninitializes the data structures on the source + * + * @opaque: data pointer passed to register_savevm_live() + */ void (*save_cleanup)(void *opaque); + + /** + * @save_live_complete_postcopy + * + * Called at the end of postcopy for all postcopyable devices. + * + * @f: QEMUFile where to send the data + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque); + + /** + * @save_live_complete_precopy + * + * Transmits the last section for the device containing any + * remaining data at the end of a precopy phase. When postcopy is + * enabled, devices that support postcopy will skip this step, + * where the final data will be flushed at the end of postcopy via + * @save_live_complete_postcopy instead. + * + * @f: QEMUFile where to send the data + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); /* This runs both outside and inside the BQL. */ + + /** + * @is_active + * + * Will skip a state section if not active + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns true if state section is active else false + */ bool (*is_active)(void *opaque); + + /** + * @has_postcopy + * + * Checks if a device supports postcopy + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns true for postcopy support else false + */ bool (*has_postcopy)(void *opaque); - /* is_active_iterate - * If it is not NULL then qemu_savevm_state_iterate will skip iteration if - * it returns false. For example, it is needed for only-postcopy-states, - * which needs to be handled by qemu_savevm_state_setup and - * qemu_savevm_state_pending, but do not need iterations until not in - * postcopy stage. + /** + * @is_active_iterate + * + * As #SaveVMHandlers.is_active(), will skip an inactive state + * section in qemu_savevm_state_iterate. + * + * For example, it is needed for only-postcopy-states, which needs + * to be handled by qemu_savevm_state_setup() and + * qemu_savevm_state_pending(), but do not need iterations until + * not in postcopy stage. + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns true if state section is active else false */ bool (*is_active_iterate)(void *opaque); @@ -48,44 +148,155 @@ typedef struct SaveVMHandlers { * use data that is local to the migration thread or protected * by other locks. */ + + /** + * @save_live_iterate + * + * Should send a chunk of data until the point that stream + * bandwidth limits tell it to stop. Each call generates one + * section. + * + * @f: QEMUFile where to send the data + * @opaque: data pointer passed to register_savevm_live() + * + * Returns 0 to indicate that there is still more data to send, + * 1 that there is no more data to send and + * negative to indicate an error. + */ int (*save_live_iterate)(QEMUFile *f, void *opaque); /* This runs outside the BQL! */ - /* Note for save_live_pending: - * must_precopy: - * - must be migrated in precopy or in stopped state - * - i.e. must be migrated before target start + + /** + * @state_pending_estimate * - * can_postcopy: - * - can migrate in postcopy or in stopped state - * - i.e. can migrate after target start - * - some can also be migrated during precopy (RAM) - * - some must be migrated after source stops (block-dirty-bitmap) + * This estimates the remaining data to transfer * - * Sum of can_postcopy and must_postcopy is the whole amount of + * Sum of @can_postcopy and @must_postcopy is the whole amount of * pending data. + * + * @opaque: data pointer passed to register_savevm_live() + * @must_precopy: amount of data that must be migrated in precopy + * or in stopped state, i.e. that must be migrated + * before target start. + * @can_postcopy: amount of data that can be migrated in postcopy + * or in stopped state, i.e. after target start. + * Some can also be migrated during precopy (RAM). + * Some must be migrated after source stops + * (block-dirty-bitmap) */ - /* This estimates the remaining data to transfer */ void (*state_pending_estimate)(void *opaque, uint64_t *must_precopy, uint64_t *can_postcopy); - /* This calculate the exact remaining data to transfer */ + + /** + * @state_pending_exact + * + * This calculates the exact remaining data to transfer + * + * Sum of @can_postcopy and @must_postcopy is the whole amount of + * pending data. + * + * @opaque: data pointer passed to register_savevm_live() + * @must_precopy: amount of data that must be migrated in precopy + * or in stopped state, i.e. that must be migrated + * before target start. + * @can_postcopy: amount of data that can be migrated in postcopy + * or in stopped state, i.e. after target start. + * Some can also be migrated during precopy (RAM). + * Some must be migrated after source stops + * (block-dirty-bitmap) + */ void (*state_pending_exact)(void *opaque, uint64_t *must_precopy, uint64_t *can_postcopy); + + /** + * @load_state + * + * Load sections generated by any of the save functions that + * generate sections. + * + * Legacy method. Should be deprecated when all users are ported + * to VMStateDescription. + * + * @f: QEMUFile where to receive the data + * @opaque: data pointer passed to register_savevm_live() + * @version_id: the maximum version_id supported + * + * Returns zero to indicate success and negative for error + */ int (*load_state)(QEMUFile *f, void *opaque, int version_id); + + /** + * @load_setup + * + * Initializes the data structures on the destination. + * + * @f: QEMUFile where to receive the data + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*load_setup)(QEMUFile *f, void *opaque); + + /** + * @load_cleanup + * + * Uninitializes the data structures on the destination. + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*load_cleanup)(void *opaque); - /* Called when postcopy migration wants to resume from failure */ + + /** + * @resume_prepare + * + * Called when postcopy migration wants to resume from failure + * + * @s: Current migration state + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ int (*resume_prepare)(MigrationState *s, void *opaque); - /* Checks if switchover ack should be used. Called only in dest */ + + /** + * @switchover_ack_needed + * + * Checks if switchover ack should be used. Called only on + * destination. + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns true if switchover ack should be used and false + * otherwise + */ bool (*switchover_ack_needed)(void *opaque); } SaveVMHandlers; +/** + * register_savevm_live: Register a set of custom migration handlers + * + * @idstr: state section identifier + * @instance_id: instance id + * @version_id: version id supported + * @ops: SaveVMHandlers structure + * @opaque: data pointer passed to SaveVMHandlers handlers + */ int register_savevm_live(const char *idstr, uint32_t instance_id, int version_id, const SaveVMHandlers *ops, void *opaque); +/** + * unregister_savevm: Unregister custom migration handlers + * + * @obj: object associated with state section + * @idstr: state section identifier + * @opaque: data pointer passed to register_savevm_live() + */ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque); #endif From e6e08e83239a067449b9698874c7547164a38414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 4 Mar 2024 13:28:27 +0100 Subject: [PATCH 08/34] migration: Do not call PRECOPY_NOTIFY_SETUP notifiers in case of error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When commit bd2270608fa0 ("migration/ram.c: add a notifier chain for precopy") added PRECOPY_NOTIFY_SETUP notifiers at the end of qemu_savevm_state_setup(), it didn't take into account a possible error in the loop calling vmstate_save() or .save_setup() handlers. Check ret value before calling the notifiers. Reviewed-by: Peter Xu Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240304122844.1888308-10-clg@redhat.com Signed-off-by: Peter Xu --- migration/savevm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/migration/savevm.c b/migration/savevm.c index e84b26e1c8..76b57a9888 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1317,7 +1317,7 @@ void qemu_savevm_state_setup(QEMUFile *f) MigrationState *ms = migrate_get_current(); SaveStateEntry *se; Error *local_err = NULL; - int ret; + int ret = 0; json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size()); json_writer_start_array(ms->vmdesc, "devices"); @@ -1351,6 +1351,10 @@ void qemu_savevm_state_setup(QEMUFile *f) } } + if (ret) { + return; + } + if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) { error_report_err(local_err); } From 61dec060821de5d81a0c5f6b827e4f0282cd92ca Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 5 Mar 2024 16:56:29 -0300 Subject: [PATCH 09/34] migration/multifd: Don't fsync when closing QIOChannelFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bc38feddeb ("io: fsync before closing a file channel") added a fsync/fdatasync at the closing point of the QIOChannelFile to ensure integrity of the migration stream in case of QEMU crash. The decision to do the sync at qio_channel_close() was not the best since that function runs in the main thread and the fsync can cause QEMU to hang for several minutes, depending on the migration size and disk speed. To fix the hang, remove the fsync from qio_channel_file_close(). At this moment, the migration code is the only user of the fsync and we're taking the tradeoff of not having a sync at all, leaving the responsibility to the upper layers. Fixes: bc38feddeb ("io: fsync before closing a file channel") Reviewed-by: "Daniel P. Berrangé" Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240305195629.9922-1-farosas@suse.de Link: https://lore.kernel.org/r/20240305174332.2553-1-farosas@suse.de [peterx: add more comment to the qio_channel_close()] Signed-off-by: Peter Xu --- docs/devel/migration/main.rst | 3 ++- io/channel-file.c | 5 ----- migration/multifd.c | 28 +++++++++++++++++++--------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 8024275d6d..54385a23e5 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -44,7 +44,8 @@ over any transport. - file migration: do the migration using a file that is passed to QEMU by path. A file offset option is supported to allow a management application to add its own metadata to the start of the file without - QEMU interference. + QEMU interference. Note that QEMU does not flush cached file + data/metadata at the end of migration. In addition, support is included for migration using RDMA, which transports the page data using ``RDMA``, where the hardware takes care of diff --git a/io/channel-file.c b/io/channel-file.c index d4706fa592..a6ad7770c6 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -242,11 +242,6 @@ static int qio_channel_file_close(QIOChannel *ioc, { QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); - if (qemu_fdatasync(fioc->fd) < 0) { - error_setg_errno(errp, errno, - "Unable to synchronize file data with storage device"); - return -1; - } if (qemu_close(fioc->fd) < 0) { error_setg_errno(errp, errno, "Unable to close file"); diff --git a/migration/multifd.c b/migration/multifd.c index d4a44da559..bf9d483f7a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -710,16 +710,26 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) if (p->c) { migration_ioc_unregister_yank(p->c); /* - * An explicit close() on the channel here is normally not - * required, but can be helpful for "file:" iochannels, where it - * will include fdatasync() to make sure the data is flushed to the - * disk backend. + * The object_unref() cannot guarantee the fd will always be + * released because finalize() of the iochannel is only + * triggered on the last reference and it's not guaranteed + * that we always hold the last refcount when reaching here. * - * The object_unref() cannot guarantee that because: (1) finalize() - * of the iochannel is only triggered on the last reference, and - * it's not guaranteed that we always hold the last refcount when - * reaching here, and, (2) even if finalize() is invoked, it only - * does a close(fd) without data flush. + * Closing the fd explicitly has the benefit that if there is any + * registered I/O handler callbacks on such fd, that will get a + * POLLNVAL event and will further trigger the cleanup to finally + * release the IOC. + * + * FIXME: It should logically be guaranteed that all multifd + * channels have no I/O handler callback registered when reaching + * here, because migration thread will wait for all multifd channel + * establishments to complete during setup. Since + * migrate_fd_cleanup() will be scheduled in main thread too, all + * previous callbacks should guarantee to be completed when + * reaching here. See multifd_send_state.channels_created and its + * usage. In the future, we could replace this with an assert + * making sure we're the last reference, or simply drop it if above + * is more clear to be justified. */ qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); From 69f7b00d057f8832a841a53d5ee31eb303157398 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 6 Mar 2024 09:06:54 +0100 Subject: [PATCH 10/34] migration/rdma: Fix a memory issue for migration In commit 3fa9642ff7 change was made to convert the RDMA backend to accept MigrateAddress struct. However, the assignment of "host" leads to data corruption on the target host and the failure of migration. isock->host = rdma->host; By allocating the memory explicitly for it with g_strdup_printf(), the issue is fixed and the migration doesn't fail any more. Fixes: 3fa9642ff7 ("migration: convert rdma backend to accept MigrateAddress") Cc: qemu-stable Cc: Li Zhijian Link: https://lore.kernel.org/r/CAHEcVy4L_D6tuhJ8h=xLR4WaPaprJE3nnxZAEyUnoTrxQ6CF5w@mail.gmail.com Signed-off-by: Yu Zhang [peterx: use g_strdup() instead of g_strdup_printf(), per Zhijian] Signed-off-by: Peter Xu --- migration/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/rdma.c b/migration/rdma.c index a355dcea89..855753c671 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -3357,7 +3357,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) goto err_rdma_dest_wait; } - isock->host = rdma->host; + isock->host = g_strdup(rdma->host); isock->port = g_strdup_printf("%d", rdma->port); /* From 4c7c8563191fd65d98cba05352a1fc1fbef6d817 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 15:37:07 +0000 Subject: [PATCH 11/34] physmem: Rename addr1 to more informative mr_addr in flatview_read/write() and similar The calls to flatview_read/write[_continue]() have parameters addr and addr1 but the names give no indication of what they are addresses of. Rename addr1 to mr_addr to reflect that it is the translated address offset within the MemoryRegion returned by flatview_translate(). Similarly rename the parameter in address_space_read/write_cached_slow() Suggested-by: Peter Xu Signed-off-by: Jonathan Cameron Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240307153710.30907-2-Jonathan.Cameron@huawei.com Signed-off-by: Peter Xu --- system/physmem.c | 50 ++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/system/physmem.c b/system/physmem.c index 6e9ed97597..e92bed50a6 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2685,7 +2685,7 @@ static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs, static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, MemTxAttrs attrs, const void *ptr, - hwaddr len, hwaddr addr1, + hwaddr len, hwaddr mr_addr, hwaddr l, MemoryRegion *mr) { uint8_t *ram_ptr; @@ -2695,12 +2695,12 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, const uint8_t *buf = ptr; for (;;) { - if (!flatview_access_allowed(mr, attrs, addr1, l)) { + if (!flatview_access_allowed(mr, attrs, mr_addr, l)) { result |= MEMTX_ACCESS_ERROR; /* Keep going. */ } else if (!memory_access_is_direct(mr, true)) { release_lock |= prepare_mmio_access(mr); - l = memory_access_size(mr, l, addr1); + l = memory_access_size(mr, l, mr_addr); /* XXX: could force current_cpu to NULL to avoid potential bugs */ @@ -2715,13 +2715,13 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, (l == 8 && len >= 8)); #endif val = ldn_he_p(buf, l); - result |= memory_region_dispatch_write(mr, addr1, val, + result |= memory_region_dispatch_write(mr, mr_addr, val, size_memop(l), attrs); } else { /* RAM case */ - ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); + ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, false); memmove(ram_ptr, buf, l); - invalidate_and_set_dirty(mr, addr1, l); + invalidate_and_set_dirty(mr, mr_addr, l); } if (release_lock) { @@ -2738,7 +2738,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, } l = len; - mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); + mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs); } return result; @@ -2749,22 +2749,22 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, const void *buf, hwaddr len) { hwaddr l; - hwaddr addr1; + hwaddr mr_addr; MemoryRegion *mr; l = len; - mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); + mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs); if (!flatview_access_allowed(mr, attrs, addr, len)) { return MEMTX_ACCESS_ERROR; } return flatview_write_continue(fv, addr, attrs, buf, len, - addr1, l, mr); + mr_addr, l, mr); } /* Called within RCU critical section. */ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, MemTxAttrs attrs, void *ptr, - hwaddr len, hwaddr addr1, hwaddr l, + hwaddr len, hwaddr mr_addr, hwaddr l, MemoryRegion *mr) { uint8_t *ram_ptr; @@ -2775,14 +2775,14 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, fuzz_dma_read_cb(addr, len, mr); for (;;) { - if (!flatview_access_allowed(mr, attrs, addr1, l)) { + if (!flatview_access_allowed(mr, attrs, mr_addr, l)) { result |= MEMTX_ACCESS_ERROR; /* Keep going. */ } else if (!memory_access_is_direct(mr, false)) { /* I/O case */ release_lock |= prepare_mmio_access(mr); - l = memory_access_size(mr, l, addr1); - result |= memory_region_dispatch_read(mr, addr1, &val, + l = memory_access_size(mr, l, mr_addr); + result |= memory_region_dispatch_read(mr, mr_addr, &val, size_memop(l), attrs); /* @@ -2798,7 +2798,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, stn_he_p(buf, l, val); } else { /* RAM case */ - ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); + ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, false); memcpy(buf, ram_ptr, l); } @@ -2816,7 +2816,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, } l = len; - mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); + mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs); } return result; @@ -2827,16 +2827,16 @@ static MemTxResult flatview_read(FlatView *fv, hwaddr addr, MemTxAttrs attrs, void *buf, hwaddr len) { hwaddr l; - hwaddr addr1; + hwaddr mr_addr; MemoryRegion *mr; l = len; - mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); + mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs); if (!flatview_access_allowed(mr, attrs, addr, len)) { return MEMTX_ACCESS_ERROR; } return flatview_read_continue(fv, addr, attrs, buf, len, - addr1, l, mr); + mr_addr, l, mr); } MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, @@ -3348,15 +3348,15 @@ MemTxResult address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr, void *buf, hwaddr len) { - hwaddr addr1, l; + hwaddr mr_addr, l; MemoryRegion *mr; l = len; - mr = address_space_translate_cached(cache, addr, &addr1, &l, false, + mr = address_space_translate_cached(cache, addr, &mr_addr, &l, false, MEMTXATTRS_UNSPECIFIED); return flatview_read_continue(cache->fv, addr, MEMTXATTRS_UNSPECIFIED, buf, len, - addr1, l, mr); + mr_addr, l, mr); } /* Called from RCU critical section. address_space_write_cached uses this @@ -3366,15 +3366,15 @@ MemTxResult address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr, const void *buf, hwaddr len) { - hwaddr addr1, l; + hwaddr mr_addr, l; MemoryRegion *mr; l = len; - mr = address_space_translate_cached(cache, addr, &addr1, &l, true, + mr = address_space_translate_cached(cache, addr, &mr_addr, &l, true, MEMTXATTRS_UNSPECIFIED); return flatview_write_continue(cache->fv, addr, MEMTXATTRS_UNSPECIFIED, buf, len, - addr1, l, mr); + mr_addr, l, mr); } #define ARG1_DECL MemoryRegionCache *cache From bcfd8ba4f5d887dc9923738131c8598e90e76bb2 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 15:37:08 +0000 Subject: [PATCH 12/34] physmem: Reduce local variable scope in flatview_read/write_continue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Precursor to factoring out the inner loops for reuse. Reviewed-by: Peter Xu Signed-off-by: Jonathan Cameron Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20240307153710.30907-3-Jonathan.Cameron@huawei.com Signed-off-by: Peter Xu --- system/physmem.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/system/physmem.c b/system/physmem.c index e92bed50a6..e35aa29343 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2688,10 +2688,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, hwaddr len, hwaddr mr_addr, hwaddr l, MemoryRegion *mr) { - uint8_t *ram_ptr; - uint64_t val; MemTxResult result = MEMTX_OK; - bool release_lock = false; const uint8_t *buf = ptr; for (;;) { @@ -2699,7 +2696,9 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, result |= MEMTX_ACCESS_ERROR; /* Keep going. */ } else if (!memory_access_is_direct(mr, true)) { - release_lock |= prepare_mmio_access(mr); + uint64_t val; + bool release_lock = prepare_mmio_access(mr); + l = memory_access_size(mr, l, mr_addr); /* XXX: could force current_cpu to NULL to avoid potential bugs */ @@ -2717,18 +2716,21 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, val = ldn_he_p(buf, l); result |= memory_region_dispatch_write(mr, mr_addr, val, size_memop(l), attrs); + if (release_lock) { + bql_unlock(); + } + + } else { /* RAM case */ - ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, false); + + uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, + false); + memmove(ram_ptr, buf, l); invalidate_and_set_dirty(mr, mr_addr, l); } - if (release_lock) { - bql_unlock(); - release_lock = false; - } - len -= l; buf += l; addr += l; @@ -2767,10 +2769,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, hwaddr len, hwaddr mr_addr, hwaddr l, MemoryRegion *mr) { - uint8_t *ram_ptr; - uint64_t val; MemTxResult result = MEMTX_OK; - bool release_lock = false; uint8_t *buf = ptr; fuzz_dma_read_cb(addr, len, mr); @@ -2780,7 +2779,9 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, /* Keep going. */ } else if (!memory_access_is_direct(mr, false)) { /* I/O case */ - release_lock |= prepare_mmio_access(mr); + uint64_t val; + bool release_lock = prepare_mmio_access(mr); + l = memory_access_size(mr, l, mr_addr); result |= memory_region_dispatch_read(mr, mr_addr, &val, size_memop(l), attrs); @@ -2796,17 +2797,16 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, (l == 8 && len >= 8)); #endif stn_he_p(buf, l, val); + if (release_lock) { + bql_unlock(); + } } else { /* RAM case */ - ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, false); + uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, + false); memcpy(buf, ram_ptr, l); } - if (release_lock) { - bql_unlock(); - release_lock = false; - } - len -= l; buf += l; addr += l; From e7927d33cf667d2c034f7b6aa0c0d0dde22a9753 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 15:37:09 +0000 Subject: [PATCH 13/34] physmem: Factor out body of flatview_read/write_continue() loop This code will be reused for the address_space_cached accessors shortly. Also reduce scope of result variable now we aren't directly calling this in the loop. Signed-off-by: Jonathan Cameron Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240307153710.30907-4-Jonathan.Cameron@huawei.com Signed-off-by: Peter Xu --- system/physmem.c | 169 +++++++++++++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 70 deletions(-) diff --git a/system/physmem.c b/system/physmem.c index e35aa29343..737869a3f5 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2681,6 +2681,56 @@ static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs, return false; } +static MemTxResult flatview_write_continue_step(MemTxAttrs attrs, + const uint8_t *buf, + hwaddr len, hwaddr mr_addr, + hwaddr *l, MemoryRegion *mr) +{ + if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) { + return MEMTX_ACCESS_ERROR; + } + + if (!memory_access_is_direct(mr, true)) { + uint64_t val; + MemTxResult result; + bool release_lock = prepare_mmio_access(mr); + + *l = memory_access_size(mr, *l, mr_addr); + /* + * XXX: could force current_cpu to NULL to avoid + * potential bugs + */ + + /* + * Assure Coverity (and ourselves) that we are not going to OVERRUN + * the buffer by following ldn_he_p(). + */ +#ifdef QEMU_STATIC_ANALYSIS + assert((*l == 1 && len >= 1) || + (*l == 2 && len >= 2) || + (*l == 4 && len >= 4) || + (*l == 8 && len >= 8)); +#endif + val = ldn_he_p(buf, *l); + result = memory_region_dispatch_write(mr, mr_addr, val, + size_memop(*l), attrs); + if (release_lock) { + bql_unlock(); + } + + return result; + } else { + /* RAM case */ + uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l, + false); + + memmove(ram_ptr, buf, *l); + invalidate_and_set_dirty(mr, mr_addr, *l); + + return MEMTX_OK; + } +} + /* Called within RCU critical section. */ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, MemTxAttrs attrs, @@ -2692,44 +2742,8 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, const uint8_t *buf = ptr; for (;;) { - if (!flatview_access_allowed(mr, attrs, mr_addr, l)) { - result |= MEMTX_ACCESS_ERROR; - /* Keep going. */ - } else if (!memory_access_is_direct(mr, true)) { - uint64_t val; - bool release_lock = prepare_mmio_access(mr); - - l = memory_access_size(mr, l, mr_addr); - /* XXX: could force current_cpu to NULL to avoid - potential bugs */ - - /* - * Assure Coverity (and ourselves) that we are not going to OVERRUN - * the buffer by following ldn_he_p(). - */ -#ifdef QEMU_STATIC_ANALYSIS - assert((l == 1 && len >= 1) || - (l == 2 && len >= 2) || - (l == 4 && len >= 4) || - (l == 8 && len >= 8)); -#endif - val = ldn_he_p(buf, l); - result |= memory_region_dispatch_write(mr, mr_addr, val, - size_memop(l), attrs); - if (release_lock) { - bql_unlock(); - } - - - } else { - /* RAM case */ - - uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, - false); - - memmove(ram_ptr, buf, l); - invalidate_and_set_dirty(mr, mr_addr, l); - } + result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l, + mr); len -= l; buf += l; @@ -2763,6 +2777,52 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, mr_addr, l, mr); } +static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf, + hwaddr len, hwaddr mr_addr, + hwaddr *l, + MemoryRegion *mr) +{ + if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) { + return MEMTX_ACCESS_ERROR; + } + + if (!memory_access_is_direct(mr, false)) { + /* I/O case */ + uint64_t val; + MemTxResult result; + bool release_lock = prepare_mmio_access(mr); + + *l = memory_access_size(mr, *l, mr_addr); + result = memory_region_dispatch_read(mr, mr_addr, &val, size_memop(*l), + attrs); + + /* + * Assure Coverity (and ourselves) that we are not going to OVERRUN + * the buffer by following stn_he_p(). + */ +#ifdef QEMU_STATIC_ANALYSIS + assert((*l == 1 && len >= 1) || + (*l == 2 && len >= 2) || + (*l == 4 && len >= 4) || + (*l == 8 && len >= 8)); +#endif + stn_he_p(buf, *l, val); + + if (release_lock) { + bql_unlock(); + } + return result; + } else { + /* RAM case */ + uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l, + false); + + memcpy(buf, ram_ptr, *l); + + return MEMTX_OK; + } +} + /* Called within RCU critical section. */ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, MemTxAttrs attrs, void *ptr, @@ -2774,38 +2834,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, fuzz_dma_read_cb(addr, len, mr); for (;;) { - if (!flatview_access_allowed(mr, attrs, mr_addr, l)) { - result |= MEMTX_ACCESS_ERROR; - /* Keep going. */ - } else if (!memory_access_is_direct(mr, false)) { - /* I/O case */ - uint64_t val; - bool release_lock = prepare_mmio_access(mr); - - l = memory_access_size(mr, l, mr_addr); - result |= memory_region_dispatch_read(mr, mr_addr, &val, - size_memop(l), attrs); - - /* - * Assure Coverity (and ourselves) that we are not going to OVERRUN - * the buffer by following stn_he_p(). - */ -#ifdef QEMU_STATIC_ANALYSIS - assert((l == 1 && len >= 1) || - (l == 2 && len >= 2) || - (l == 4 && len >= 4) || - (l == 8 && len >= 8)); -#endif - stn_he_p(buf, l, val); - if (release_lock) { - bql_unlock(); - } - } else { - /* RAM case */ - uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, &l, - false); - memcpy(buf, ram_ptr, l); - } + result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr); len -= l; buf += l; From 47293c922cb4d70aae5b8100936ffc624d2a4076 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Mar 2024 15:37:10 +0000 Subject: [PATCH 14/34] physmem: Fix wrong address in large address_space_read/write_cached_slow() If the access is bigger than the MemoryRegion supports, flatview_read/write_continue() will attempt to update the Memory Region. but the address passed to flatview_translate() is relative to the cache, not to the FlatView. On arm/virt with interleaved CXL memory emulation and virtio-blk-pci this lead to the first part of descriptor being read from the CXL memory and the second part from PA 0x8 which happens to be a blank region of a flash chip and all ffs on this particular configuration. Note this test requires the out of tree ARM support for CXL, but the problem is more general. Avoid this by adding new address_space_read_continue_cached() and address_space_write_continue_cached() which share all the logic with the flatview versions except for the MemoryRegion lookup which is unnecessary as the MemoryRegionCache only covers one MemoryRegion. Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240307153710.30907-5-Jonathan.Cameron@huawei.com Signed-off-by: Peter Xu --- system/physmem.c | 63 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/system/physmem.c b/system/physmem.c index 737869a3f5..6cfb7a80ab 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -3370,6 +3370,59 @@ static inline MemoryRegion *address_space_translate_cached( return section.mr; } +/* Called within RCU critical section. */ +static MemTxResult address_space_write_continue_cached(MemTxAttrs attrs, + const void *ptr, + hwaddr len, + hwaddr mr_addr, + hwaddr l, + MemoryRegion *mr) +{ + MemTxResult result = MEMTX_OK; + const uint8_t *buf = ptr; + + for (;;) { + result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l, + mr); + + len -= l; + buf += l; + mr_addr += l; + + if (!len) { + break; + } + + l = len; + } + + return result; +} + +/* Called within RCU critical section. */ +static MemTxResult address_space_read_continue_cached(MemTxAttrs attrs, + void *ptr, hwaddr len, + hwaddr mr_addr, hwaddr l, + MemoryRegion *mr) +{ + MemTxResult result = MEMTX_OK; + uint8_t *buf = ptr; + + for (;;) { + result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr); + len -= l; + buf += l; + mr_addr += l; + + if (!len) { + break; + } + l = len; + } + + return result; +} + /* Called from RCU critical section. address_space_read_cached uses this * out of line function when the target is an MMIO or IOMMU region. */ @@ -3383,9 +3436,8 @@ address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr, l = len; mr = address_space_translate_cached(cache, addr, &mr_addr, &l, false, MEMTXATTRS_UNSPECIFIED); - return flatview_read_continue(cache->fv, - addr, MEMTXATTRS_UNSPECIFIED, buf, len, - mr_addr, l, mr); + return address_space_read_continue_cached(MEMTXATTRS_UNSPECIFIED, + buf, len, mr_addr, l, mr); } /* Called from RCU critical section. address_space_write_cached uses this @@ -3401,9 +3453,8 @@ address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr, l = len; mr = address_space_translate_cached(cache, addr, &mr_addr, &l, true, MEMTXATTRS_UNSPECIFIED); - return flatview_write_continue(cache->fv, - addr, MEMTXATTRS_UNSPECIFIED, buf, len, - mr_addr, l, mr); + return address_space_write_continue_cached(MEMTXATTRS_UNSPECIFIED, + buf, len, mr_addr, l, mr); } #define ARG1_DECL MemoryRegionCache *cache From a1bb5dd169f4e47690a3b53c64692808472171a7 Mon Sep 17 00:00:00 2001 From: Anthony PERARD Date: Mon, 11 Mar 2024 12:34:39 +0000 Subject: [PATCH 15/34] migration: Fix format in error message In file_write_ramblock_iov(), "offset" is "uintptr_t" and not "ram_addr_t". While usually they are both equivalent, this is not the case with CONFIG_XEN_BACKEND. Use the right format. This will fix build on 32-bit. Fixes: f427d90b9898 ("migration/multifd: Support outgoing mapped-ram stream format") Signed-off-by: Anthony PERARD Link: https://lore.kernel.org/r/20240311123439.16844-1-anthony.perard@citrix.com Signed-off-by: Peter Xu --- migration/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/file.c b/migration/file.c index 164b079966..5054a60851 100644 --- a/migration/file.c +++ b/migration/file.c @@ -191,7 +191,7 @@ int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, */ offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host; if (offset >= block->used_length) { - error_setg(errp, "offset " RAM_ADDR_FMT + error_setg(errp, "offset %" PRIxPTR "outside of ramblock %s range", offset, block->idstr); ret = -1; break; From f3bff6c44304a21ea99eeed336672bd46ca102d5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:39 -0700 Subject: [PATCH 16/34] migration: export fewer options A small number of migration options are accessed by migration clients, but to see them clients must include all of options.h, which is mostly for migration core code. migrate_mode() in particular will be needed by multiple clients. Refactor the option declarations so clients can see the necessary few via misc.h, which already exports a portion of the client API. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179319-294320-1-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/migration.c | 1 - hw/virtio/virtio-balloon.c | 1 - include/migration/client-options.h | 24 ++++++++++++++++++++++++ include/migration/misc.h | 1 + migration/options.h | 6 +----- system/dirtylimit.c | 1 - 6 files changed, 26 insertions(+), 8 deletions(-) create mode 100644 include/migration/client-options.h diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index f82dcabc49..49c0016add 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -18,7 +18,6 @@ #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" #include "migration/migration.h" -#include "migration/options.h" #include "migration/savevm.h" #include "migration/vmstate.h" #include "migration/qemu-file.h" diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 89f853fa9e..a59ff172bd 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -32,7 +32,6 @@ #include "qemu/error-report.h" #include "migration/misc.h" #include "migration/migration.h" -#include "migration/options.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" diff --git a/include/migration/client-options.h b/include/migration/client-options.h new file mode 100644 index 0000000000..887fea1565 --- /dev/null +++ b/include/migration/client-options.h @@ -0,0 +1,24 @@ +/* + * QEMU public migration capabilities + * + * Copyright (c) 2012-2023 Red Hat Inc + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_MIGRATION_CLIENT_OPTIONS_H +#define QEMU_MIGRATION_CLIENT_OPTIONS_H + +/* capabilities */ + +bool migrate_background_snapshot(void); +bool migrate_dirty_limit(void); +bool migrate_postcopy_ram(void); +bool migrate_switchover_ack(void); + +/* parameters */ + +MigMode migrate_mode(void); + +#endif diff --git a/include/migration/misc.h b/include/migration/misc.h index 5d1aa593ed..4c226a40bb 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -17,6 +17,7 @@ #include "qemu/notify.h" #include "qapi/qapi-types-migration.h" #include "qapi/qapi-types-net.h" +#include "migration/client-options.h" /* migration/ram.c */ diff --git a/migration/options.h b/migration/options.h index 6ddd8dad9b..b6b69c2bb7 100644 --- a/migration/options.h +++ b/migration/options.h @@ -16,6 +16,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/client-options.h" /* migration properties */ @@ -24,12 +25,10 @@ extern Property migration_properties[]; /* capabilities */ bool migrate_auto_converge(void); -bool migrate_background_snapshot(void); bool migrate_block(void); bool migrate_colo(void); bool migrate_compress(void); bool migrate_dirty_bitmaps(void); -bool migrate_dirty_limit(void); bool migrate_events(void); bool migrate_mapped_ram(void); bool migrate_ignore_shared(void); @@ -38,11 +37,9 @@ bool migrate_multifd(void); bool migrate_pause_before_switchover(void); bool migrate_postcopy_blocktime(void); bool migrate_postcopy_preempt(void); -bool migrate_postcopy_ram(void); bool migrate_rdma_pin_all(void); bool migrate_release_ram(void); bool migrate_return_path(void); -bool migrate_switchover_ack(void); bool migrate_validate_uuid(void); bool migrate_xbzrle(void); bool migrate_zero_blocks(void); @@ -84,7 +81,6 @@ uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); -MigMode migrate_mode(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); diff --git a/system/dirtylimit.c b/system/dirtylimit.c index b5607eb8c2..774ff44f79 100644 --- a/system/dirtylimit.c +++ b/system/dirtylimit.c @@ -26,7 +26,6 @@ #include "trace.h" #include "migration/misc.h" #include "migration/migration.h" -#include "migration/options.h" /* * Dirtylimit stop working if dirty page rate error From f853fa0714061989b14d7f85667a353b69e2d9b6 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:48 -0700 Subject: [PATCH 17/34] migration: remove migration.h references Remove migration.h from files that no longer need it due to previous commits. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-2-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/container.c | 1 - hw/virtio/vhost-user.c | 1 - hw/virtio/virtio-balloon.c | 1 - system/qdev-monitor.c | 1 - target/loongarch/kvm/kvm.c | 1 - tests/unit/test-vmstate.c | 1 - 6 files changed, 6 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index bd25b9fbad..ff081a12c2 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -32,7 +32,6 @@ #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" -#include "migration/migration.h" #include "pci.h" VFIOGroupList vfio_group_list = diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index a1eea8547e..1af8621481 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -26,7 +26,6 @@ #include "qemu/sockets.h" #include "sysemu/runstate.h" #include "sysemu/cryptodev.h" -#include "migration/migration.h" #include "migration/postcopy-ram.h" #include "trace.h" #include "exec/ramblock.h" diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index a59ff172bd..609e39a821 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,7 +31,6 @@ #include "trace.h" #include "qemu/error-report.h" #include "migration/misc.h" -#include "migration/migration.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 09e07cab9b..c1243891c3 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -38,7 +38,6 @@ #include "qemu/option_int.h" #include "sysemu/block-backend.h" #include "migration/misc.h" -#include "migration/migration.h" #include "qemu/cutils.h" #include "hw/qdev-properties.h" #include "hw/clock.h" diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index c19978a970..11a69a3b4e 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -22,7 +22,6 @@ #include "hw/irq.h" #include "qemu/log.h" #include "hw/loader.h" -#include "migration/migration.h" #include "sysemu/runstate.h" #include "cpu-csr.h" #include "kvm_loongarch.h" diff --git a/tests/unit/test-vmstate.c b/tests/unit/test-vmstate.c index c4f9faa273..63f28f26f4 100644 --- a/tests/unit/test-vmstate.c +++ b/tests/unit/test-vmstate.c @@ -24,7 +24,6 @@ #include "qemu/osdep.h" -#include "../migration/migration.h" #include "migration/vmstate.h" #include "migration/qemu-file-types.h" #include "../migration/qemu-file.h" From 7dcb3c87d87cbf3359c9bbef8066e5195ee92f4d Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:49 -0700 Subject: [PATCH 18/34] migration: export migration_is_setup_or_active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete the MigrationState parameter from migration_is_setup_or_active and move it to the public API in misc.h. Signed-off-by: Steve Sistare Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/1710179338-294359-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/common.c | 2 +- include/migration/misc.h | 1 + migration/migration.c | 12 ++++++------ migration/migration.h | 1 - migration/ram.c | 5 ++--- net/vhost-vdpa.c | 3 +-- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 059bfdc07a..896eab8103 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -152,7 +152,7 @@ static void vfio_set_migration_error(int err) { MigrationState *ms = migrate_get_current(); - if (migration_is_setup_or_active(ms->state)) { + if (migration_is_setup_or_active()) { WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { if (ms->to_dst_file) { qemu_file_set_error(ms->to_dst_file, err); diff --git a/include/migration/misc.h b/include/migration/misc.h index 4c226a40bb..79cff6224e 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -61,6 +61,7 @@ void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(MigrationState *); +bool migration_is_setup_or_active(void); bool migrate_mode_is_cpr(MigrationState *); typedef enum MigrationEventType { diff --git a/migration/migration.c b/migration/migration.c index a49fcd53ee..af21403bad 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1081,9 +1081,11 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value) * Return true if we're already in the middle of a migration * (i.e. any of the active or setup states) */ -bool migration_is_setup_or_active(int state) +bool migration_is_setup_or_active(void) { - switch (state) { + MigrationState *s = current_migration; + + switch (s->state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: @@ -1601,10 +1603,8 @@ bool migration_incoming_postcopy_advised(void) bool migration_in_bg_snapshot(void) { - MigrationState *s = migrate_get_current(); - return migrate_background_snapshot() && - migration_is_setup_or_active(s->state); + migration_is_setup_or_active(); } bool migration_is_idle(void) @@ -2297,7 +2297,7 @@ static void *source_return_path_thread(void *opaque) trace_source_return_path_thread_entry(); rcu_register_thread(); - while (migration_is_setup_or_active(ms->state)) { + while (migration_is_setup_or_active()) { trace_source_return_path_thread_loop_top(); header_type = qemu_get_be16(rp); diff --git a/migration/migration.h b/migration/migration.h index 65c0b61cbd..736460aa8b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -479,7 +479,6 @@ bool migrate_has_error(MigrationState *s); void migrate_fd_connect(MigrationState *s, Error *error_in); -bool migration_is_setup_or_active(int state); bool migration_is_running(int state); int migrate_init(MigrationState *s, Error **errp); diff --git a/migration/ram.c b/migration/ram.c index 2cd936d9ce..3ee8cb47d3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2909,10 +2909,9 @@ void qemu_guest_free_page_hint(void *addr, size_t len) RAMBlock *block; ram_addr_t offset; size_t used_len, start, npages; - MigrationState *s = migrate_get_current(); /* This function is currently expected to be used during live migration */ - if (!migration_is_setup_or_active(s->state)) { + if (!migration_is_setup_or_active()) { return; } @@ -3263,7 +3262,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 - && migration_is_setup_or_active(migrate_get_current()->state)) { + && migration_is_setup_or_active()) { if (migrate_multifd() && migrate_multifd_flush_after_each_section() && !migrate_mapped_ram()) { ret = multifd_send_sync_main(); diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index e6bdb4562d..8564817073 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -26,7 +26,6 @@ #include #include "standard-headers/linux/virtio_net.h" #include "monitor/monitor.h" -#include "migration/migration.h" #include "migration/misc.h" #include "hw/virtio/vhost.h" @@ -355,7 +354,7 @@ static int vhost_vdpa_net_data_start(NetClientState *nc) assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); if (s->always_svq || - migration_is_setup_or_active(migrate_get_current()->state)) { + migration_is_setup_or_active()) { v->shadow_vqs_enabled = true; } else { v->shadow_vqs_enabled = false; From 3a6813b68cf21a366522564f83f885bf2657dcc8 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:50 -0700 Subject: [PATCH 19/34] migration: export migration_is_active Delete the MigrationState parameter from migration_is_active so it can be exported and used without including migration.h. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-4-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/common.c | 4 ++-- include/migration/misc.h | 2 +- migration/migration.c | 10 ++++++---- system/dirtylimit.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 896eab8103..2dbbf62e15 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -182,7 +182,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); - if (ms->state != MIGRATION_STATUS_ACTIVE && + if (!migration_is_active() && ms->state != MIGRATION_STATUS_DEVICE) { return false; } @@ -225,7 +225,7 @@ vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; - if (!migration_is_active(migrate_get_current())) { + if (!migration_is_active()) { return false; } diff --git a/include/migration/misc.h b/include/migration/misc.h index 79cff6224e..e1f1bf853e 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -60,7 +60,7 @@ void dump_vmstate_json_to_file(FILE *out_fp); void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); -bool migration_is_active(MigrationState *); +bool migration_is_active(void); bool migration_is_setup_or_active(void); bool migrate_mode_is_cpr(MigrationState *); diff --git a/migration/migration.c b/migration/migration.c index af21403bad..17859cbaee 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1406,7 +1406,7 @@ static void migrate_fd_cleanup(MigrationState *s) qemu_fclose(tmp); } - assert(!migration_is_active(s)); + assert(!migration_is_active()); if (s->state == MIGRATION_STATUS_CANCELLING) { migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, @@ -1637,8 +1637,10 @@ bool migration_is_idle(void) return false; } -bool migration_is_active(MigrationState *s) +bool migration_is_active(void) { + MigrationState *s = current_migration; + return (s->state == MIGRATION_STATUS_ACTIVE || s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } @@ -3461,7 +3463,7 @@ static void *migration_thread(void *opaque) trace_migration_thread_setup_complete(); - while (migration_is_active(s)) { + while (migration_is_active()) { if (urgent || !migration_rate_exceeded(s->to_dst_file)) { MigIterateState iter_state = migration_iteration_run(s); if (iter_state == MIG_ITERATE_SKIP) { @@ -3607,7 +3609,7 @@ static void *bg_migration_thread(void *opaque) migration_bh_schedule(bg_migration_vm_start_bh, s); bql_unlock(); - while (migration_is_active(s)) { + while (migration_is_active()) { MigIterateState iter_state = bg_migration_iteration_run(s); if (iter_state == MIG_ITERATE_SKIP) { continue; diff --git a/system/dirtylimit.c b/system/dirtylimit.c index 774ff44f79..051e0311c1 100644 --- a/system/dirtylimit.c +++ b/system/dirtylimit.c @@ -83,7 +83,7 @@ static void vcpu_dirty_rate_stat_collect(void) int64_t period = DIRTYLIMIT_CALC_TIME_MS; if (migrate_dirty_limit() && - migration_is_active(s)) { + migration_is_active()) { period = s->parameters.x_vcpu_dirty_limit_period; } From aeaafb1e59f81f5cc715e656dac23f3fe5db3faa Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:51 -0700 Subject: [PATCH 20/34] migration: export migration_is_running Delete the MigrationState parameter from migration_is_running and move it to the public API in misc.h. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- include/migration/misc.h | 1 + migration/migration.c | 10 ++++++---- migration/migration.h | 2 -- migration/options.c | 4 ++-- migration/savevm.c | 2 +- system/dirtylimit.c | 2 +- target/riscv/kvm/kvm-cpu.c | 4 ++-- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index e1f1bf853e..7526977de6 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -106,6 +106,7 @@ int migration_call_notifiers(MigrationState *s, MigrationEventType type, bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); +bool migration_is_running(void); /* ...and after the device transmission */ /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); diff --git a/migration/migration.c b/migration/migration.c index 17859cbaee..546ba86c63 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1103,9 +1103,11 @@ bool migration_is_setup_or_active(void) } } -bool migration_is_running(int state) +bool migration_is_running(void) { - switch (state) { + MigrationState *s = current_migration; + + switch (s->state) { case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: @@ -1477,7 +1479,7 @@ static void migrate_fd_cancel(MigrationState *s) do { old_state = s->state; - if (!migration_is_running(old_state)) { + if (!migration_is_running()) { break; } /* If the migration is paused, kick it out of the pause */ @@ -1962,7 +1964,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return true; } - if (migration_is_running(s->state)) { + if (migration_is_running()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return false; } diff --git a/migration/migration.h b/migration/migration.h index 736460aa8b..e4983db9c9 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -479,8 +479,6 @@ bool migrate_has_error(MigrationState *s); void migrate_fd_connect(MigrationState *s, Error *error_in); -bool migration_is_running(int state); - int migrate_init(MigrationState *s, Error **errp); bool migration_is_blocked(Error **errp); /* True if outgoing migration has entered postcopy phase */ diff --git a/migration/options.c b/migration/options.c index 40eb930940..642cfb00a3 100644 --- a/migration/options.c +++ b/migration/options.c @@ -681,7 +681,7 @@ bool migrate_cap_set(int cap, bool value, Error **errp) MigrationState *s = migrate_get_current(); bool new_caps[MIGRATION_CAPABILITY__MAX]; - if (migration_is_running(s->state)) { + if (migration_is_running()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return false; } @@ -725,7 +725,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationCapabilityStatusList *cap; bool new_caps[MIGRATION_CAPABILITY__MAX]; - if (migration_is_running(s->state) || migration_in_colo_state()) { + if (migration_is_running() || migration_in_colo_state()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; } diff --git a/migration/savevm.c b/migration/savevm.c index 76b57a9888..388d7af7cd 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1706,7 +1706,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) MigrationState *ms = migrate_get_current(); MigrationStatus status; - if (migration_is_running(ms->state)) { + if (migration_is_running()) { error_setg(errp, QERR_MIGRATION_ACTIVE); return -EINVAL; } diff --git a/system/dirtylimit.c b/system/dirtylimit.c index 051e0311c1..1622bb7426 100644 --- a/system/dirtylimit.c +++ b/system/dirtylimit.c @@ -451,7 +451,7 @@ static bool dirtylimit_is_allowed(void) { MigrationState *ms = migrate_get_current(); - if (migration_is_running(ms->state) && + if (migration_is_running() && (!qemu_thread_is_self(&ms->thread)) && migrate_dirty_limit() && dirtylimit_in_service()) { diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index c7afdb1e81..cda7d78a77 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -44,7 +44,7 @@ #include "kvm_riscv.h" #include "sbi_ecall_interface.h" #include "chardev/char-fe.h" -#include "migration/migration.h" +#include "migration/misc.h" #include "sysemu/runstate.h" #include "hw/riscv/numa.h" @@ -729,7 +729,7 @@ static void kvm_riscv_put_regs_timer(CPUState *cs) * frequency. Therefore, we should check whether they are the same here * during the migration. */ - if (migration_is_running(migrate_get_current()->state)) { + if (migration_is_running()) { KVM_RISCV_GET_TIMER(cs, frequency, reg); if (reg != env->kvm_timer_frequency) { error_report("Dst Hosts timer frequency != Src Hosts"); From 714f33123bfc73c6caacd19a44e082da8cdc96e3 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:52 -0700 Subject: [PATCH 21/34] migration: export vcpu_dirty_limit_period Define and export vcpu_dirty_limit_period to eliminate a dependency on MigrationState. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-6-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- include/migration/client-options.h | 1 + migration/options.c | 7 +++++++ system/dirtylimit.c | 3 +-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/migration/client-options.h b/include/migration/client-options.h index 887fea1565..59f4b55cf4 100644 --- a/include/migration/client-options.h +++ b/include/migration/client-options.h @@ -20,5 +20,6 @@ bool migrate_switchover_ack(void); /* parameters */ MigMode migrate_mode(void); +uint64_t migrate_vcpu_dirty_limit_period(void); #endif diff --git a/migration/options.c b/migration/options.c index 642cfb00a3..09178c6f60 100644 --- a/migration/options.c +++ b/migration/options.c @@ -924,6 +924,13 @@ const char *migrate_tls_hostname(void) return s->parameters.tls_hostname; } +uint64_t migrate_vcpu_dirty_limit_period(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.x_vcpu_dirty_limit_period; +} + uint64_t migrate_xbzrle_cache_size(void) { MigrationState *s = migrate_get_current(); diff --git a/system/dirtylimit.c b/system/dirtylimit.c index 1622bb7426..b0afaa0776 100644 --- a/system/dirtylimit.c +++ b/system/dirtylimit.c @@ -77,14 +77,13 @@ static bool dirtylimit_quit; static void vcpu_dirty_rate_stat_collect(void) { - MigrationState *s = migrate_get_current(); VcpuStat stat; int i = 0; int64_t period = DIRTYLIMIT_CALC_TIME_MS; if (migrate_dirty_limit() && migration_is_active()) { - period = s->parameters.x_vcpu_dirty_limit_period; + period = migrate_vcpu_dirty_limit_period(); } /* calculate vcpu dirtyrate */ From 6e7856397641326163ef493197a849f99a9a6a44 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:53 -0700 Subject: [PATCH 22/34] migration: migration_thread_is_self Define and export migration_thread_is_self to eliminate a dependency on MigrationState. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-7-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- include/migration/misc.h | 1 + migration/migration.c | 7 +++++++ system/dirtylimit.c | 5 +---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index 7526977de6..c4b5416357 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -61,6 +61,7 @@ void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(void); +bool migration_thread_is_self(void); bool migration_is_setup_or_active(void); bool migrate_mode_is_cpr(MigrationState *); diff --git a/migration/migration.c b/migration/migration.c index 546ba86c63..afe72af0b1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1647,6 +1647,13 @@ bool migration_is_active(void) s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +bool migration_thread_is_self(void) +{ + MigrationState *s = current_migration; + + return qemu_thread_is_self(&s->thread); +} + bool migrate_mode_is_cpr(MigrationState *s) { return s->parameters.mode == MIG_MODE_CPR_REBOOT; diff --git a/system/dirtylimit.c b/system/dirtylimit.c index b0afaa0776..ab20da34bb 100644 --- a/system/dirtylimit.c +++ b/system/dirtylimit.c @@ -25,7 +25,6 @@ #include "sysemu/kvm.h" #include "trace.h" #include "migration/misc.h" -#include "migration/migration.h" /* * Dirtylimit stop working if dirty page rate error @@ -448,10 +447,8 @@ static void dirtylimit_cleanup(void) */ static bool dirtylimit_is_allowed(void) { - MigrationState *ms = migrate_get_current(); - if (migration_is_running() && - (!qemu_thread_is_self(&ms->thread)) && + !migration_thread_is_self() && migrate_dirty_limit() && dirtylimit_in_service()) { return false; From 9bb630c6ee6e930e1c521829681365e5672bbd99 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:54 -0700 Subject: [PATCH 23/34] migration: migration_is_device Define and export migration_is_device to eliminate a dependency on MigrationState. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-8-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/common.c | 4 +--- include/migration/misc.h | 1 + migration/migration.c | 7 +++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 2dbbf62e15..de010680ff 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -180,10 +180,8 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; - MigrationState *ms = migrate_get_current(); - if (!migration_is_active() && - ms->state != MIGRATION_STATUS_DEVICE) { + if (!migration_is_active() && !migration_is_device()) { return false; } diff --git a/include/migration/misc.h b/include/migration/misc.h index c4b5416357..28cfaed2c7 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -61,6 +61,7 @@ void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(void); +bool migration_is_device(void); bool migration_thread_is_self(void); bool migration_is_setup_or_active(void); bool migrate_mode_is_cpr(MigrationState *); diff --git a/migration/migration.c b/migration/migration.c index afe72af0b1..db1e627848 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1647,6 +1647,13 @@ bool migration_is_active(void) s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +bool migration_is_device(void) +{ + MigrationState *s = current_migration; + + return s->state == MIGRATION_STATUS_DEVICE; +} + bool migration_thread_is_self(void) { MigrationState *s = current_migration; From 20c64c8a51a477115589b3a9f7304e423dcbe7f2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:55 -0700 Subject: [PATCH 24/34] migration: migration_file_set_error Define and export migration_file_set_error to eliminate a dependency on MigrationState. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-9-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- hw/vfio/common.c | 9 +-------- hw/vfio/migration.c | 11 +++-------- include/migration/misc.h | 2 ++ migration/migration.c | 11 +++++++++++ 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index de010680ff..b44204eade 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -39,7 +39,6 @@ #include "sysemu/runstate.h" #include "trace.h" #include "qapi/error.h" -#include "migration/migration.h" #include "migration/misc.h" #include "migration/blocker.h" #include "migration/qemu-file.h" @@ -150,14 +149,8 @@ bool vfio_viommu_preset(VFIODevice *vbasedev) static void vfio_set_migration_error(int err) { - MigrationState *ms = migrate_get_current(); - if (migration_is_setup_or_active()) { - WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { - if (ms->to_dst_file) { - qemu_file_set_error(ms->to_dst_file, err); - } - } + migration_file_set_error(err); } } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 49c0016add..a15fd486c6 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -17,13 +17,12 @@ #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" -#include "migration/migration.h" +#include "migration/misc.h" #include "migration/savevm.h" #include "migration/vmstate.h" #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" -#include "migration/misc.h" #include "qapi/error.h" #include "exec/ramlist.h" #include "exec/ram_addr.h" @@ -714,9 +713,7 @@ static void vfio_vmstate_change_prepare(void *opaque, bool running, * Migration should be aborted in this case, but vm_state_notify() * currently does not support reporting failures. */ - if (migrate_get_current()->to_dst_file) { - qemu_file_set_error(migrate_get_current()->to_dst_file, ret); - } + migration_file_set_error(ret); } trace_vfio_vmstate_change_prepare(vbasedev->name, running, @@ -746,9 +743,7 @@ static void vfio_vmstate_change(void *opaque, bool running, RunState state) * Migration should be aborted in this case, but vm_state_notify() * currently does not support reporting failures. */ - if (migrate_get_current()->to_dst_file) { - qemu_file_set_error(migrate_get_current()->to_dst_file, ret); - } + migration_file_set_error(ret); } trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), diff --git a/include/migration/misc.h b/include/migration/misc.h index 28cfaed2c7..e521cd5229 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -109,6 +109,8 @@ bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); bool migration_is_running(void); +void migration_file_set_error(int err); + /* ...and after the device transmission */ /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); diff --git a/migration/migration.c b/migration/migration.c index db1e627848..216f63d62b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3038,6 +3038,17 @@ static MigThrError postcopy_pause(MigrationState *s) } } +void migration_file_set_error(int err) +{ + MigrationState *s = current_migration; + + WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { + if (s->to_dst_file) { + qemu_file_set_error(s->to_dst_file, err); + } + } +} + static MigThrError migration_detect_error(MigrationState *s) { int ret; From 7395127f230abe8065eeec274f106c29a8a4498a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:56 -0700 Subject: [PATCH 25/34] migration: privatize colo interfaces Remove private migration interfaces from net/colo-compare.c and push them to migration/colo.c. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-10-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- migration/colo.c | 17 +++++++++++------ net/colo-compare.c | 3 +-- stubs/colo.c | 1 - 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/migration/colo.c b/migration/colo.c index 315e31fe32..84632a603e 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -63,9 +63,9 @@ static bool colo_runstate_is_stopped(void) return runstate_check(RUN_STATE_COLO) || !runstate_is_running(); } -static void colo_checkpoint_notify(void *opaque) +static void colo_checkpoint_notify(void) { - MigrationState *s = opaque; + MigrationState *s = migrate_get_current(); int64_t next_notify_time; qemu_event_set(&s->colo_checkpoint_event); @@ -74,10 +74,15 @@ static void colo_checkpoint_notify(void *opaque) timer_mod(s->colo_delay_timer, next_notify_time); } +static void colo_checkpoint_notify_timer(void *opaque) +{ + colo_checkpoint_notify(); +} + void colo_checkpoint_delay_set(void) { if (migration_in_colo_state()) { - colo_checkpoint_notify(migrate_get_current()); + colo_checkpoint_notify(); } } @@ -162,7 +167,7 @@ static void primary_vm_do_failover(void) * kick COLO thread which might wait at * qemu_sem_wait(&s->colo_checkpoint_sem). */ - colo_checkpoint_notify(s); + colo_checkpoint_notify(); /* * Wake up COLO thread which may blocked in recv() or send(), @@ -518,7 +523,7 @@ out: static void colo_compare_notify_checkpoint(Notifier *notifier, void *data) { - colo_checkpoint_notify(data); + colo_checkpoint_notify(); } static void colo_process_checkpoint(MigrationState *s) @@ -642,7 +647,7 @@ void migrate_start_colo_process(MigrationState *s) bql_unlock(); qemu_event_init(&s->colo_checkpoint_event, false); s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, - colo_checkpoint_notify, s); + colo_checkpoint_notify_timer, NULL); qemu_sem_init(&s->colo_exit_sem, 0); colo_process_checkpoint(s); diff --git a/net/colo-compare.c b/net/colo-compare.c index f2dfc0ebdc..c4ad0ab71f 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -28,7 +28,6 @@ #include "sysemu/iothread.h" #include "net/colo-compare.h" #include "migration/colo.h" -#include "migration/migration.h" #include "util.h" #include "block/aio-wait.h" @@ -189,7 +188,7 @@ static void colo_compare_inconsistency_notify(CompareState *s) notify_remote_frame(s); } else { notifier_list_notify(&colo_compare_notifiers, - migrate_get_current()); + NULL); } } diff --git a/stubs/colo.c b/stubs/colo.c index 08c9f982d5..f8c069b739 100644 --- a/stubs/colo.c +++ b/stubs/colo.c @@ -2,7 +2,6 @@ #include "qemu/notify.h" #include "net/colo-compare.h" #include "migration/colo.h" -#include "migration/migration.h" #include "qemu/error-report.h" #include "qapi/qapi-commands-migration.h" From a3ed48933659b52f942a677b80b31e4234ddb32f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:57 -0700 Subject: [PATCH 26/34] migration: delete unused accessors Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-11-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- include/migration/misc.h | 3 --- migration/migration.c | 10 ---------- 2 files changed, 13 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index e521cd5229..d563d2c801 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -105,13 +105,10 @@ void migration_add_notifier_mode(NotifierWithReturn *notify, void migration_remove_notifier(NotifierWithReturn *notify); int migration_call_notifiers(MigrationState *s, MigrationEventType type, Error **errp); -bool migration_in_setup(MigrationState *); -bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); bool migration_is_running(void); void migration_file_set_error(int err); -/* ...and after the device transmission */ /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); /* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */ diff --git a/migration/migration.c b/migration/migration.c index 216f63d62b..644e073b7d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1548,16 +1548,6 @@ int migration_call_notifiers(MigrationState *s, MigrationEventType type, return ret; } -bool migration_in_setup(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_SETUP; -} - -bool migration_has_finished(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_COMPLETED; -} - bool migration_has_failed(MigrationState *s) { return (s->state == MIGRATION_STATUS_CANCELLED || From c9539d9b14e6369ff169951f581b4c5cea1786e2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Mon, 11 Mar 2024 10:48:58 -0700 Subject: [PATCH 27/34] migration: purge MigrationState from public interface Move remaining MigrationState references from the public file misc.h to the private file migration.h. Signed-off-by: Steve Sistare Link: https://lore.kernel.org/r/1710179338-294359-12-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- include/migration/misc.h | 6 ++---- migration/migration.h | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index d563d2c801..c9e200f4eb 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -64,7 +64,6 @@ bool migration_is_active(void); bool migration_is_device(void); bool migration_thread_is_self(void); bool migration_is_setup_or_active(void); -bool migrate_mode_is_cpr(MigrationState *); typedef enum MigrationEventType { MIG_EVENT_PRECOPY_SETUP, @@ -103,16 +102,15 @@ void migration_add_notifier_mode(NotifierWithReturn *notify, MigrationNotifyFunc func, MigMode mode); void migration_remove_notifier(NotifierWithReturn *notify); -int migration_call_notifiers(MigrationState *s, MigrationEventType type, - Error **errp); -bool migration_has_failed(MigrationState *); bool migration_is_running(void); void migration_file_set_error(int err); /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ bool migration_in_incoming_postcopy(void); + /* True if incoming migration entered POSTCOPY_INCOMING_ADVISE */ bool migration_incoming_postcopy_advised(void); + /* True if background snapshot is active */ bool migration_in_bg_snapshot(void); diff --git a/migration/migration.h b/migration/migration.h index e4983db9c9..8045e39c26 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -26,6 +26,7 @@ #include "qom/object.h" #include "postcopy-ram.h" #include "sysemu/runstate.h" +#include "migration/misc.h" struct PostcopyBlocktimeContext; @@ -479,12 +480,17 @@ bool migrate_has_error(MigrationState *s); void migrate_fd_connect(MigrationState *s, Error *error_in); +int migration_call_notifiers(MigrationState *s, MigrationEventType type, + Error **errp); + int migrate_init(MigrationState *s, Error **errp); bool migration_is_blocked(Error **errp); /* True if outgoing migration has entered postcopy phase */ bool migration_in_postcopy(void); bool migration_postcopy_is_alive(int state); MigrationState *migrate_get_current(void); +bool migration_has_failed(MigrationState *); +bool migrate_mode_is_cpr(MigrationState *); uint64_t ram_get_total_transferred_pages(void); From 44fe138edc0f08b8568fe4ee90be6a2a56c67daf Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 18:00:09 +0000 Subject: [PATCH 28/34] migration/multifd: Allow zero pages in file migration Currently, it's an error to have no data pages in the multifd file migration because zero page detection is done in the migration thread and zero pages don't reach multifd. This is enforced with the pages->num assert. We're about to add zero page detection on the multifd thread. Fix the file_write_ramblock_iov() to stop considering p->iovs_num=0 an error. Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-2-hao.xiang@linux.dev Signed-off-by: Peter Xu --- migration/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/file.c b/migration/file.c index 5054a60851..b0b963e0ce 100644 --- a/migration/file.c +++ b/migration/file.c @@ -159,7 +159,7 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp) { - ssize_t ret = -1; + ssize_t ret = 0; int i, slice_idx, slice_num; uintptr_t base, next, offset; size_t len; From c3cdf3fb18e5ecf8a3b4dc7223ddbfc53c418eb8 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 18:00:10 +0000 Subject: [PATCH 29/34] migration/multifd: Allow clearing of the file_bmap from multifd We currently only need to clear the mapped-ram file bitmap from the migration thread during save_zero_page. We're about to add support for zero page detection on the multifd thread, so allow ramblock_set_file_bmap_atomic() to also clear the bits. Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-3-hao.xiang@linux.dev Signed-off-by: Peter Xu --- migration/multifd.c | 2 +- migration/ram.c | 8 ++++++-- migration/ram.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index bf9d483f7a..3ba922694e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -115,7 +115,7 @@ static void multifd_set_file_bitmap(MultiFDSendParams *p) assert(pages->block); for (int i = 0; i < p->pages->num; i++) { - ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); } } diff --git a/migration/ram.c b/migration/ram.c index 3ee8cb47d3..dec2e73f8e 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -3149,9 +3149,13 @@ static void ram_save_file_bmap(QEMUFile *f) } } -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) { - set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + if (set) { + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } else { + clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } } /** diff --git a/migration/ram.h b/migration/ram.h index b9ac0da587..08feecaf51 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,7 +75,8 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, + bool set); /* ram cache */ int colo_init_ram_cache(void); From 5fdbb1dfccfd59661c95cae760b8e276c5b8e65c Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:11 +0000 Subject: [PATCH 30/34] migration/multifd: Add new migration option zero-page-detection. This new parameter controls where the zero page checking is running. 1. If this parameter is set to 'legacy', zero page checking is done in the migration main thread. 2. If this parameter is set to 'none', zero page checking is disabled. Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Acked-by: Markus Armbruster Link: https://lore.kernel.org/r/20240311180015.3359271-4-hao.xiang@linux.dev Signed-off-by: Peter Xu --- hw/core/qdev-properties-system.c | 10 +++++++++ include/hw/qdev-properties-system.h | 4 ++++ migration/migration-hmp-cmds.c | 9 ++++++++ migration/options.c | 21 ++++++++++++++++++ migration/options.h | 1 + migration/ram.c | 4 ++++ qapi/migration.json | 33 ++++++++++++++++++++++++++--- 7 files changed, 79 insertions(+), 3 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b45e90edb2..71a21bf24e 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -693,6 +693,16 @@ const PropertyInfo qdev_prop_granule_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 626be87dd3..438f65389f 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -9,6 +9,7 @@ extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; extern const PropertyInfo qdev_prop_granule_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_bios_chs_trans; @@ -50,6 +51,9 @@ extern const PropertyInfo qdev_prop_iothread_vq_mapping_list; MigMode) #define DEFINE_PROP_GRANULE_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_granule_mode, GranuleMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 99b49df5dd..7e96ae6ffd 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -344,6 +344,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); @@ -634,6 +639,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { diff --git a/migration/options.c b/migration/options.c index 09178c6f60..8f2a3a2fa5 100644 --- a/migration/options.c +++ b/migration/options.c @@ -179,6 +179,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -938,6 +941,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -1048,6 +1058,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; return params; } @@ -1084,6 +1096,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; } /* @@ -1398,6 +1411,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_mode) { dest->mode = params->mode; } + + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1548,6 +1565,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_mode) { s->parameters.mode = params->mode; } + + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) diff --git a/migration/options.h b/migration/options.h index b6b69c2bb7..ab8199e207 100644 --- a/migration/options.h +++ b/migration/options.h @@ -90,6 +90,7 @@ const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/ram.c b/migration/ram.c index dec2e73f8e..260529f264 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1140,6 +1140,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } diff --git a/qapi/migration.json b/qapi/migration.json index 51d188b902..83fdef73b9 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -670,6 +670,18 @@ { 'enum': 'MigMode', 'data': [ 'normal', 'cpr-reboot' ] } +## +# @ZeroPageDetection: +# +# @none: Do not perform zero page checking. +# +# @legacy: Perform zero page checking in main migration thread. +# +# Since: 9.0 +## +{ 'enum': 'ZeroPageDetection', + 'data': [ 'none', 'legacy' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -891,6 +903,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -924,7 +940,8 @@ 'block-bitmap-mapping', { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', - 'mode'] } + 'mode', + 'zero-page-detection'] } ## # @MigrateSetParameters: @@ -1083,6 +1100,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -1136,7 +1157,8 @@ '*x-vcpu-dirty-limit-period': { 'type': 'uint64', 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', - '*mode': 'MigMode'} } + '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection'} } ## # @migrate-set-parameters: @@ -1311,6 +1333,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -1361,7 +1387,8 @@ '*x-vcpu-dirty-limit-period': { 'type': 'uint64', 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', - '*mode': 'MigMode'} } + '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection'} } ## # @query-migrate-parameters: From 303e6f54f9657be76ee060006ee2d4cacff263a0 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:12 +0000 Subject: [PATCH 31/34] migration/multifd: Implement zero page transmission on the multifd thread. 1. Add zero_pages field in MultiFDPacket_t. 2. Implements the zero page detection and handling on the multifd threads for non-compression, zlib and zstd compression backends. 3. Added a new value 'multifd' in ZeroPageDetection enumeration. 4. Adds zero page counters and updates multifd send/receive tracing format to track the newly added counters. Signed-off-by: Hao Xiang Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-5-hao.xiang@linux.dev Signed-off-by: Peter Xu --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-zero-page.c | 87 ++++++++++++++++++++++++++++++ migration/multifd-zlib.c | 21 ++++++-- migration/multifd-zstd.c | 20 +++++-- migration/multifd.c | 90 +++++++++++++++++++++++++++----- migration/multifd.h | 23 +++++++- migration/ram.c | 1 - migration/trace-events | 8 +-- qapi/migration.json | 7 ++- 10 files changed, 228 insertions(+), 32 deletions(-) create mode 100644 migration/multifd-zero-page.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 71a21bf24e..7eca2f2377 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -696,7 +696,7 @@ const PropertyInfo qdev_prop_granule_mode = { const PropertyInfo qdev_prop_zero_page_detection = { .name = "ZeroPageDetection", .description = "zero_page_detection values, " - "none,legacy", + "none,legacy,multifd", .enum_table = &ZeroPageDetection_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 92b1cc4297..1eeb915ff6 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'multifd-zero-page.c', 'ram-compress.c', 'options.c', 'postcopy-ram.c', diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c new file mode 100644 index 0000000000..1ba38be636 --- /dev/null +++ b/migration/multifd-zero-page.c @@ -0,0 +1,87 @@ +/* + * Multifd zero page detection implementation. + * + * Copyright (c) 2024 Bytedance Inc + * + * Authors: + * Hao Xiang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. + */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (!buffer_is_zero(page, p->page_size)) { + memset(page, 0, p->page_size); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6120faad65..83c0374380 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; + +out: p->flags |= MULTIFD_FLAG_ZLIB; - multifd_send_fill_packet(p); - return 0; } @@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index cac236833d..02112255ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,16 +118,18 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + pages->offset[i]; @@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; + +out: p->flags |= MULTIFD_FLAG_ZSTD; - multifd_send_fill_packet(p); - return 0; } @@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZSTD); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { diff --git a/migration/multifd.c b/migration/multifd.c index 3ba922694e..0179422f6d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -11,6 +11,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -111,12 +112,17 @@ void multifd_send_channel_created(void) static void multifd_set_file_bitmap(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; + uint32_t zero_num = p->pages->num - p->pages->normal_num; assert(pages->block); - for (int i = 0; i < p->pages->num; i++) { + for (int i = 0; i < p->pages->normal_num; i++) { ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); } + + for (int i = p->pages->num; i < zero_num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false); + } } /* Multifd without compression */ @@ -153,13 +159,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < pages->num; i++) { + for (int i = 0; i < pages->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = pages->num * p->page_size; + p->next_packet_size = pages->normal_num * p->page_size; } /** @@ -178,6 +184,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) bool use_zero_copy_send = migrate_zero_copy_send(); int ret; + multifd_send_zero_page_detect(p); + if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); multifd_set_file_bitmap(p); @@ -261,6 +269,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; @@ -295,6 +310,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) * overwritten later when reused. */ pages->num = 0; + pages->normal_num = 0; pages->block = NULL; } @@ -386,11 +402,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -408,10 +426,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; - p->total_normal_pages += pages->num; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -452,20 +471,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, - p->next_packet_size); + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -491,6 +519,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } @@ -918,6 +958,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); multifd_pages_reset(p->pages); p->next_packet_size = 0; @@ -965,7 +1007,8 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1316,6 +1359,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->iov = NULL; g_free(p->normal); p->normal = NULL; + g_free(p->zero); + p->zero = NULL; multifd_recv_state->ops->recv_cleanup(p); } @@ -1449,7 +1494,7 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } else { /* @@ -1507,7 +1552,9 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1559,6 +1606,7 @@ int multifd_recv_setup(Error **errp) p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } @@ -1633,3 +1681,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + multifd_send_prepare_header(p); + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index 7447c2bea3..c9d9b09239 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -55,14 +55,24 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; /* offset of each page */ @@ -136,6 +146,8 @@ typedef struct { uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ @@ -194,12 +206,18 @@ typedef struct { uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ void *compress_data; } MultiFDRecvParams; @@ -221,6 +239,9 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { diff --git a/migration/ram.c b/migration/ram.c index 260529f264..c26435adc7 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1288,7 +1288,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) if (!multifd_queue_page(block, offset)) { return -1; } - stat64_add(&mig_stats.normal_pages, 1); return 1; } diff --git a/migration/trace-events b/migration/trace-events index bf1a069632..f0e1cb80c7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/qapi/migration.json b/qapi/migration.json index 83fdef73b9..2684e4e9ac 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -677,10 +677,15 @@ # # @legacy: Perform zero page checking in main migration thread. # +# @multifd: Perform zero page checking in multifd sender thread if +# multifd migration is enabled, else in the main migration +# thread as for @legacy. +# # Since: 9.0 +# ## { 'enum': 'ZeroPageDetection', - 'data': [ 'none', 'legacy' ] } + 'data': [ 'none', 'legacy', 'multifd' ] } ## # @BitmapMigrationBitmapAliasTransform: From 9ae90f73e623c8b8c7ec1fccd8ca493805df8fbd Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:13 +0000 Subject: [PATCH 32/34] migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. 1. Add a dedicated handler for MigrationOps::ram_save_target_page in multifd live migration. 2. Refactor ram_save_target_page_legacy so that the legacy and multifd handlers don't have internal functions calling into each other. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Message-Id: <20240226195654.934709-4-hao.xiang@bytedance.com> Link: https://lore.kernel.org/r/20240311180015.3359271-6-hao.xiang@linux.dev Signed-off-by: Peter Xu --- migration/ram.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index c26435adc7..8deb84984f 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2079,7 +2079,6 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2095,17 +2094,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. + * While using multifd live migration, we still need to handle zero + * page checking on the migration main thread. */ - if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(block, offset); + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (save_zero_page(rs, pss, offset)) { + return 1; + } } - return ram_save_page(rs, pss); + return ram_save_multifd_page(block, offset); } /* Should be called before sending a host page */ @@ -3112,7 +3127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } bql_unlock(); ret = multifd_send_sync_main(); From 70c25c92e602f393d3c33596530c5f2b18491e55 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:14 +0000 Subject: [PATCH 33/34] migration/multifd: Enable multifd zero page checking by default. 1. Set default "zero-page-detection" option to "multifd". Now zero page checking can be done in the multifd threads and this becomes the default configuration. 2. Handle migration QEMU9.0 -> QEMU8.2 compatibility. We provide backward compatibility where zero page checking is done from the migration main thread. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240311180015.3359271-7-hao.xiang@linux.dev Signed-off-by: Peter Xu --- hw/core/machine.c | 4 +++- migration/options.c | 2 +- qapi/migration.json | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 9ac5d5389a..0e9d646b61 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -32,7 +32,9 @@ #include "hw/virtio/virtio-net.h" #include "audio/audio.h" -GlobalProperty hw_compat_8_2[] = {}; +GlobalProperty hw_compat_8_2[] = { + { "migration", "zero-page-detection", "legacy"}, +}; const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2); GlobalProperty hw_compat_8_1[] = { diff --git a/migration/options.c b/migration/options.c index 8f2a3a2fa5..9ed2fe4bee 100644 --- a/migration/options.c +++ b/migration/options.c @@ -181,7 +181,7 @@ Property migration_properties[] = { MIG_MODE_NORMAL), DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, parameters.zero_page_detection, - ZERO_PAGE_DETECTION_LEGACY), + ZERO_PAGE_DETECTION_MULTIFD), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), diff --git a/qapi/migration.json b/qapi/migration.json index 2684e4e9ac..aa1b39bce1 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -909,7 +909,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. # (since 9.0) # # Features: @@ -1106,7 +1106,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. # (since 9.0) # # Features: @@ -1339,7 +1339,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. # (since 9.0) # # Features: From 1815338df00fd0a3fe25085564c6966f74c8f43d Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:15 +0000 Subject: [PATCH 34/34] migration/multifd: Add new migration test cases for legacy zero page checking. Now that zero page checking is done on the multifd sender threads by default, we still provide an option for backward compatibility. This change adds a qtest migration test case to set the zero-page-detection option to "legacy" and run multifd migration with zero page checking on the migration main thread. Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240311180015.3359271-8-hao.xiang@linux.dev Signed-off-by: Peter Xu --- tests/qtest/migration-test.c | 52 ++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4023d808f9..71895abb7f 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2771,6 +2771,24 @@ test_migrate_precopy_tcp_multifd_start(QTestState *from, return test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); } +static void * +test_migrate_precopy_tcp_multifd_start_zero_page_legacy(QTestState *from, + QTestState *to) +{ + test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); + migrate_set_parameter_str(from, "zero-page-detection", "legacy"); + return NULL; +} + +static void * +test_migration_precopy_tcp_multifd_start_no_zero_page(QTestState *from, + QTestState *to) +{ + test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); + migrate_set_parameter_str(from, "zero-page-detection", "none"); + return NULL; +} + static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) @@ -2812,6 +2830,36 @@ static void test_multifd_tcp_none(void) test_precopy_common(&args); } +static void test_multifd_tcp_zero_page_legacy(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_start_zero_page_legacy, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure + * everything will work alright even if guest page is changing. + */ + .live = true, + }; + test_precopy_common(&args); +} + +static void test_multifd_tcp_no_zero_page(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migration_precopy_tcp_multifd_start_no_zero_page, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure + * everything will work alright even if guest page is changing. + */ + .live = true, + }; + test_precopy_common(&args); +} + static void test_multifd_tcp_zlib(void) { MigrateCommon args = { @@ -3729,6 +3777,10 @@ int main(int argc, char **argv) } migration_test_add("/migration/multifd/tcp/plain/none", test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/zero-page/legacy", + test_multifd_tcp_zero_page_legacy); + migration_test_add("/migration/multifd/tcp/plain/zero-page/none", + test_multifd_tcp_no_zero_page); migration_test_add("/migration/multifd/tcp/plain/cancel", test_multifd_tcp_cancel); migration_test_add("/migration/multifd/tcp/plain/zlib",