Migration pull 2021-02-08

v2
   Dropped vmstate: Fix memory leak in vmstate_handle_alloc
     Broke on Power
   Added migration: only check page size match if RAM postcopy is enabled
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEERfXHG0oMt/uXep+pBRYzHrxb/ecFAmAhIE4ACgkQBRYzHrxb
 /ecPuA/+Pgo++1ZSseJUgbLePwyTVc0jahdcvYEDmLUn8UM6ikBcBXBgUKHdkFW3
 bjSSVgB/xxvXSiafBK4xFNrCqSgqMSr3DJcHmvWgv2wVARcYf6Z26Da53LZq1Qru
 0tvRyb40Od1f9zb8Zj7e2Y3pjQ9ybLLbjfNhgnOBbQivqWkjZI31oV2KUCWY2+eV
 T1BEwr6mgYepqhmeB6OvQZtaQVC5toirS6NajNF4nt0vZEIGIvK6/A9erCVU8Tze
 5ch1J0MUqgc3q6ZSE/I9BHEy6MaL0X8G6H+ezjxdoRQtbt1iM/YqZJCSrXkAxiLC
 ROohryb6qVk26+UYuana79faLwrw359WlkwNEE6SEIRSENu+6p7bgN3LZuCILCO7
 xJEkeTgy6r40IGCkDC9aWa8pyLHpNX9gyLpGBHdIRD6zEOWaKNtzh7E2uo/T0ann
 BpcfgQOsYN25hIHiiXnxozUREbx71VDfMq7GqGB6eC3u2+a3U6jpSJb1nNq5NB89
 FJYLZy5Rbuy7OStMwfMsxRs7E63XvGgnwrN8FczU/pumCPX4lDYIpnocqinUmP8p
 XubRQQVaVDSKIq1mvzw7iR/1NsP9vfYvnrAIv941f38NBmDKqdPuMOXR/qB/Kp2Y
 jB7b1L5/JcXbWsQmK7fda9jmPzFwSO2cTeTiUonk9RfuuDEws0A=
 =4tbe
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20210208a' into staging

Migration pull 2021-02-08

v2
  Dropped vmstate: Fix memory leak in vmstate_handle_alloc
    Broke on Power
  Added migration: only check page size match if RAM postcopy is enabled

# gpg: Signature made Mon 08 Feb 2021 11:28:14 GMT
# gpg:                using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A  9FA9 0516 331E BC5B FDE7

* remotes/dgilbert/tags/pull-migration-20210208a: (27 commits)
  migration: only check page size match if RAM postcopy is enabled
  migration: introduce snapshot-{save, load, delete} QMP commands
  iotests: fix loading of common.config from tests/ subdir
  iotests: add support for capturing and matching QMP events
  migration: introduce a delete_snapshot wrapper
  migration: wire up support for snapshot device selection
  migration: control whether snapshots are overwritten
  block: rename and alter bdrv_all_find_snapshot semantics
  block: allow specifying name of block device for vmstate storage
  block: add ability to specify list of blockdevs during snapshot
  migration: stop returning errno from load_snapshot()
  migration: Make save_snapshot() return bool, not 0/-1
  block: push error reporting into bdrv_all_*_snapshot functions
  migration: Display the migration blockers
  migration: Add blocker information
  migration: Fix a few absurdly defective error messages
  migration: Fix cache_init()'s "Failed to allocate" error messages
  migration: Clean up signed vs. unsigned XBZRLE cache-size
  migration: Fix migrate-set-parameters argument validation
  migration: introduce 'userfaultfd-wrlat.py' script
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2021-02-08 18:23:47 +00:00
commit 2436651b26
30 changed files with 2153 additions and 234 deletions

View File

@ -900,10 +900,11 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
ImageEntry *image_entry, *next_ie; ImageEntry *image_entry, *next_ie;
SnapshotEntry *snapshot_entry; SnapshotEntry *snapshot_entry;
Error *err = NULL;
bs = bdrv_all_find_vmstate_bs(); bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, &err);
if (!bs) { if (!bs) {
monitor_printf(mon, "No available block device supports snapshots\n"); error_report_err(err);
return; return;
} }
aio_context = bdrv_get_aio_context(bs); aio_context = bdrv_get_aio_context(bs);
@ -953,7 +954,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
total = 0; total = 0;
for (i = 0; i < nb_sns; i++) { for (i = 0; i < nb_sns; i++) {
SnapshotEntry *next_sn; SnapshotEntry *next_sn;
if (bdrv_all_find_snapshot(sn_tab[i].name, &bs1) == 0) { if (bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL) == 1) {
global_snapshots[total] = i; global_snapshots[total] = i;
total++; total++;
QTAILQ_FOREACH(image_entry, &image_list, next) { QTAILQ_FOREACH(image_entry, &image_list, next) {

View File

@ -447,6 +447,41 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
return ret; return ret;
} }
/*
 * Collect the block nodes that a snapshot operation should act on.
 *
 * @has_devices: true when the caller supplied an explicit device list
 * @devices: node names to look up with bdrv_find_node(); only consulted
 *           when @has_devices is true, in which case it must be non-empty
 * @all_bdrvs: on success, receives a newly built GList of BlockDriverState
 *             pointers; ownership of the list passes to the caller
 * @errp: set when the explicit list is empty or names an unknown node
 *
 * Without an explicit list, every node yielded by bdrv_first()/bdrv_next()
 * is collected.
 *
 * Returns 0 on success, -1 on failure.  On failure @all_bdrvs is left
 * untouched and the partially built list is dropped by g_autoptr.
 */
static int bdrv_all_get_snapshot_devices(bool has_devices, strList *devices,
                                         GList **all_bdrvs,
                                         Error **errp)
{
    g_autoptr(GList) collected = NULL;
    BlockDriverState *bs;

    if (!has_devices) {
        BdrvNextIterator it;

        for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
            collected = g_list_append(collected, bs);
        }
    } else if (!devices) {
        error_setg(errp, "At least one device is required for snapshot");
        return -1;
    } else {
        strList *entry;

        for (entry = devices; entry; entry = entry->next) {
            bs = bdrv_find_node(entry->value);
            if (!bs) {
                error_setg(errp, "No block device node '%s'", entry->value);
                return -1;
            }
            collected = g_list_append(collected, bs);
        }
    }

    *all_bdrvs = g_steal_pointer(&collected);
    return 0;
}
static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs)
{ {
if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
@ -462,44 +497,59 @@ static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs)
* These functions will properly handle dataplane (take aio_context_acquire * These functions will properly handle dataplane (take aio_context_acquire
* when appropriate for appropriate block drivers) */ * when appropriate for appropriate block drivers) */
bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
Error **errp)
{ {
bool ok = true; g_autoptr(GList) bdrvs = NULL;
BlockDriverState *bs; GList *iterbdrvs;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
return false;
}
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs); AioContext *ctx = bdrv_get_aio_context(bs);
bool ok = true;
aio_context_acquire(ctx); aio_context_acquire(ctx);
if (bdrv_all_snapshots_includes_bs(bs)) { if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ok = bdrv_can_snapshot(bs); ok = bdrv_can_snapshot(bs);
} }
aio_context_release(ctx); aio_context_release(ctx);
if (!ok) { if (!ok) {
bdrv_next_cleanup(&it); error_setg(errp, "Device '%s' is writable but does not support "
goto fail; "snapshots", bdrv_get_device_or_node_name(bs));
return false;
} }
iterbdrvs = iterbdrvs->next;
} }
fail: return true;
*first_bad_bs = bs;
return ok;
} }
int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, int bdrv_all_delete_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp) Error **errp)
{ {
int ret = 0; g_autoptr(GList) bdrvs = NULL;
BlockDriverState *bs; GList *iterbdrvs;
BdrvNextIterator it;
QEMUSnapshotInfo sn1, *snapshot = &sn1;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
return -1;
}
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs); AioContext *ctx = bdrv_get_aio_context(bs);
QEMUSnapshotInfo sn1, *snapshot = &sn1;
int ret = 0;
aio_context_acquire(ctx); aio_context_acquire(ctx);
if (bdrv_all_snapshots_includes_bs(bs) && if ((devices || bdrv_all_snapshots_includes_bs(bs)) &&
bdrv_snapshot_find(bs, snapshot, name) >= 0) bdrv_snapshot_find(bs, snapshot, name) >= 0)
{ {
ret = bdrv_snapshot_delete(bs, snapshot->id_str, ret = bdrv_snapshot_delete(bs, snapshot->id_str,
@ -507,118 +557,180 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
} }
aio_context_release(ctx); aio_context_release(ctx);
if (ret < 0) { if (ret < 0) {
bdrv_next_cleanup(&it); error_prepend(errp, "Could not delete snapshot '%s' on '%s': ",
goto fail; name, bdrv_get_device_or_node_name(bs));
return -1;
} }
iterbdrvs = iterbdrvs->next;
} }
fail: return 0;
*first_bad_bs = bs;
return ret;
} }
int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, int bdrv_all_goto_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp) Error **errp)
{ {
int ret = 0; g_autoptr(GList) bdrvs = NULL;
BlockDriverState *bs; GList *iterbdrvs;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
return -1;
}
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs); AioContext *ctx = bdrv_get_aio_context(bs);
int ret = 0;
aio_context_acquire(ctx); aio_context_acquire(ctx);
if (bdrv_all_snapshots_includes_bs(bs)) { if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ret = bdrv_snapshot_goto(bs, name, errp); ret = bdrv_snapshot_goto(bs, name, errp);
} }
aio_context_release(ctx); aio_context_release(ctx);
if (ret < 0) { if (ret < 0) {
bdrv_next_cleanup(&it); error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
goto fail; name, bdrv_get_device_or_node_name(bs));
return -1;
} }
iterbdrvs = iterbdrvs->next;
} }
fail: return 0;
*first_bad_bs = bs;
return ret;
} }
int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) int bdrv_all_has_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp)
{ {
QEMUSnapshotInfo sn; g_autoptr(GList) bdrvs = NULL;
int err = 0; GList *iterbdrvs;
BlockDriverState *bs;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
AioContext *ctx = bdrv_get_aio_context(bs); return -1;
aio_context_acquire(ctx);
if (bdrv_all_snapshots_includes_bs(bs)) {
err = bdrv_snapshot_find(bs, &sn, name);
}
aio_context_release(ctx);
if (err < 0) {
bdrv_next_cleanup(&it);
goto fail;
}
} }
fail: iterbdrvs = bdrvs;
*first_bad_bs = bs; while (iterbdrvs) {
return err; BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
QEMUSnapshotInfo sn;
int ret = 0;
aio_context_acquire(ctx);
if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ret = bdrv_snapshot_find(bs, &sn, name);
}
aio_context_release(ctx);
if (ret < 0) {
if (ret == -ENOENT) {
return 0;
} else {
error_setg_errno(errp, errno,
"Could not check snapshot '%s' on '%s'",
name, bdrv_get_device_or_node_name(bs));
return -1;
}
}
iterbdrvs = iterbdrvs->next;
}
return 1;
} }
int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState *vm_state_bs, BlockDriverState *vm_state_bs,
uint64_t vm_state_size, uint64_t vm_state_size,
BlockDriverState **first_bad_bs) bool has_devices, strList *devices,
Error **errp)
{ {
int err = 0; g_autoptr(GList) bdrvs = NULL;
BlockDriverState *bs; GList *iterbdrvs;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
return -1;
}
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs); AioContext *ctx = bdrv_get_aio_context(bs);
int ret = 0;
aio_context_acquire(ctx); aio_context_acquire(ctx);
if (bs == vm_state_bs) { if (bs == vm_state_bs) {
sn->vm_state_size = vm_state_size; sn->vm_state_size = vm_state_size;
err = bdrv_snapshot_create(bs, sn); ret = bdrv_snapshot_create(bs, sn);
} else if (bdrv_all_snapshots_includes_bs(bs)) { } else if (devices || bdrv_all_snapshots_includes_bs(bs)) {
sn->vm_state_size = 0; sn->vm_state_size = 0;
err = bdrv_snapshot_create(bs, sn); ret = bdrv_snapshot_create(bs, sn);
} }
aio_context_release(ctx); aio_context_release(ctx);
if (err < 0) { if (ret < 0) {
bdrv_next_cleanup(&it); error_setg(errp, "Could not create snapshot '%s' on '%s'",
goto fail; sn->name, bdrv_get_device_or_node_name(bs));
return -1;
} }
iterbdrvs = iterbdrvs->next;
} }
fail: return 0;
*first_bad_bs = bs;
return err;
} }
BlockDriverState *bdrv_all_find_vmstate_bs(void)
{
BlockDriverState *bs;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
bool has_devices, strList *devices,
Error **errp)
{
g_autoptr(GList) bdrvs = NULL;
GList *iterbdrvs;
if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
return NULL;
}
iterbdrvs = bdrvs;
while (iterbdrvs) {
BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs); AioContext *ctx = bdrv_get_aio_context(bs);
bool found; bool found = false;
aio_context_acquire(ctx); aio_context_acquire(ctx);
found = bdrv_all_snapshots_includes_bs(bs) && bdrv_can_snapshot(bs); found = (devices || bdrv_all_snapshots_includes_bs(bs)) &&
bdrv_can_snapshot(bs);
aio_context_release(ctx); aio_context_release(ctx);
if (found) { if (vmstate_bs) {
bdrv_next_cleanup(&it); if (g_str_equal(vmstate_bs,
break; bdrv_get_node_name(bs))) {
if (found) {
return bs;
} else {
error_setg(errp,
"vmstate block device '%s' does not support snapshots",
vmstate_bs);
return NULL;
}
}
} else if (found) {
return bs;
} }
iterbdrvs = iterbdrvs->next;
} }
return bs;
if (vmstate_bs) {
error_setg(errp,
"vmstate block device '%s' does not exist", vmstate_bs);
} else {
error_setg(errp,
"no block device can store vmstate for snapshot");
}
return NULL;
} }

View File

@ -2173,6 +2173,16 @@ static int spapr_pci_pre_save(void *opaque)
return 0; return 0;
} }
static int spapr_pci_post_save(void *opaque)
{
SpaprPhbState *sphb = opaque;
g_free(sphb->msi_devs);
sphb->msi_devs = NULL;
sphb->msi_devs_num = 0;
return 0;
}
static int spapr_pci_post_load(void *opaque, int version_id) static int spapr_pci_post_load(void *opaque, int version_id)
{ {
SpaprPhbState *sphb = opaque; SpaprPhbState *sphb = opaque;
@ -2205,6 +2215,7 @@ static const VMStateDescription vmstate_spapr_pci = {
.version_id = 2, .version_id = 2,
.minimum_version_id = 2, .minimum_version_id = 2,
.pre_save = spapr_pci_pre_save, .pre_save = spapr_pci_pre_save,
.post_save = spapr_pci_post_save,
.post_load = spapr_pci_post_load, .post_load = spapr_pci_post_load,
.fields = (VMStateField[]) { .fields = (VMStateField[]) {
VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL), VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL),

View File

@ -25,7 +25,7 @@
#ifndef SNAPSHOT_H #ifndef SNAPSHOT_H
#define SNAPSHOT_H #define SNAPSHOT_H
#include "qapi/qapi-builtin-types.h"
#define SNAPSHOT_OPT_BASE "snapshot." #define SNAPSHOT_OPT_BASE "snapshot."
#define SNAPSHOT_OPT_ID "snapshot.id" #define SNAPSHOT_OPT_ID "snapshot.id"
@ -77,17 +77,26 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
* These functions will properly handle dataplane (take aio_context_acquire * These functions will properly handle dataplane (take aio_context_acquire
* when appropriate for appropriate block drivers */ * when appropriate for appropriate block drivers */
bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs); bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bsd_bs,
Error **errp);
int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs,
Error **errp); Error **errp);
int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs); int bdrv_all_delete_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp);
int bdrv_all_goto_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp);
int bdrv_all_has_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp);
int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState *vm_state_bs, BlockDriverState *vm_state_bs,
uint64_t vm_state_size, uint64_t vm_state_size,
BlockDriverState **first_bad_bs); bool has_devices,
strList *devices,
Error **errp);
BlockDriverState *bdrv_all_find_vmstate_bs(void); BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
bool has_devices, strList *devices,
Error **errp);
#endif #endif

View File

@ -149,6 +149,14 @@ typedef struct IOMMUTLBEvent {
/* RAM is a persistent kind memory */ /* RAM is a persistent kind memory */
#define RAM_PMEM (1 << 5) #define RAM_PMEM (1 << 5)
/*
* UFFDIO_WRITEPROTECT is used on this RAMBlock to
* support 'write-tracking' migration type.
* Implies ram_state->ram_wt_enabled.
*/
#define RAM_UF_WRITEPROTECT (1 << 6)
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags, IOMMUNotifierFlag flags,
hwaddr start, hwaddr end, hwaddr start, hwaddr end,

View File

@ -15,7 +15,50 @@
#ifndef QEMU_MIGRATION_SNAPSHOT_H #ifndef QEMU_MIGRATION_SNAPSHOT_H
#define QEMU_MIGRATION_SNAPSHOT_H #define QEMU_MIGRATION_SNAPSHOT_H
int save_snapshot(const char *name, Error **errp); #include "qapi/qapi-builtin-types.h"
int load_snapshot(const char *name, Error **errp);
/**
* save_snapshot: Save an internal snapshot.
* @name: name of internal snapshot
* @overwrite: replace existing snapshot with @name
* @vmstate: blockdev node name to store VM state in
* @has_devices: whether to use explicit device list
* @devices: explicit device list to snapshot
* @errp: pointer to error object
* On success, return %true.
* On failure, store an error through @errp and return %false.
*/
bool save_snapshot(const char *name, bool overwrite,
const char *vmstate,
bool has_devices, strList *devices,
Error **errp);
/**
* load_snapshot: Load an internal snapshot.
* @name: name of internal snapshot
* @vmstate: blockdev node name to load VM state from
* @has_devices: whether to use explicit device list
* @devices: explicit device list to snapshot
* @errp: pointer to error object
* On success, return %true.
* On failure, store an error through @errp and return %false.
*/
bool load_snapshot(const char *name,
const char *vmstate,
bool has_devices, strList *devices,
Error **errp);
/**
* delete_snapshot: Delete a snapshot.
* @name: name of the snapshot to delete
* @has_devices: whether to use explicit device list
* @devices: explicit device list to delete the snapshot from
* @errp: pointer to error object
* On success, return %true.
* On failure, store an error through @errp and return %false.
*/
bool delete_snapshot(const char *name,
bool has_devices, strList *devices,
Error **errp);
#endif #endif

View File

@ -0,0 +1,35 @@
/*
 * Linux UFFD-WP support
 *
 * Copyright Virtuozzo GmbH, 2020
 *
 * Authors:
 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
#ifndef USERFAULTFD_H
#define USERFAULTFD_H

/*
 * "qemu/osdep.h" is intentionally not included from this header: every
 * .c file includes it first, so the basic types used below are already
 * available by the time this header is read.
 */
#include "exec/hwaddr.h"
#include <linux/userfaultfd.h>

/*
 * Helpers operating on a userfaultfd file descriptor.
 * NOTE(review): the implementations are not part of this header; the
 * per-function semantics (return conventions, units of @tmo) should be
 * confirmed against the corresponding .c file.
 */
int uffd_query_features(uint64_t *features);
int uffd_create_fd(uint64_t features, bool non_blocking);
void uffd_close_fd(int uffd_fd);
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls);
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length);
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake);
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake);
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake);
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length);
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count);
bool uffd_poll_events(int uffd_fd, int tmo);

#endif /* USERFAULTFD_H */

View File

@ -58,6 +58,7 @@
#include "qemu/queue.h" #include "qemu/queue.h"
#include "multifd.h" #include "multifd.h"
#include "qemu/yank.h" #include "qemu/yank.h"
#include "sysemu/cpus.h"
#ifdef CONFIG_VFIO #ifdef CONFIG_VFIO
#include "hw/vfio/vfio-common.h" #include "hw/vfio/vfio-common.h"
@ -134,6 +135,38 @@ enum mig_rp_message_type {
MIG_RP_MSG_MAX MIG_RP_MSG_MAX
}; };
/*
 * Migration capabilities set: a counted list of MigrationCapability
 * values, used below to describe groups of capabilities.
 */
struct MigrateCapsSet {
int size; /* Number of entries in caps[] */
MigrationCapability caps[]; /* Flexible array member holding the entries */
};
typedef struct MigrateCapsSet MigrateCapsSet;
/*
 * Define and initialize a MigrateCapsSet named _name from the listed
 * capabilities.  The entry count is derived by measuring a temporary
 * int[] compound literal built from the same argument list, so .size
 * always matches the number of initializers given to .caps.
 * Note that statically initializing a flexible array member relies on a
 * compiler extension.
 */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \
MigrateCapsSet _name = { \
.size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
.caps = { __VA_ARGS__ } \
}
/*
 * Background-snapshot compatibility check list: capabilities that
 * migrate_caps_check() refuses to combine with 'background-snapshot'.
 */
static const
INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
MIGRATION_CAPABILITY_POSTCOPY_RAM,
MIGRATION_CAPABILITY_DIRTY_BITMAPS,
MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
MIGRATION_CAPABILITY_RETURN_PATH,
MIGRATION_CAPABILITY_MULTIFD,
MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
MIGRATION_CAPABILITY_AUTO_CONVERGE,
MIGRATION_CAPABILITY_RELEASE_RAM,
MIGRATION_CAPABILITY_RDMA_PIN_ALL,
MIGRATION_CAPABILITY_COMPRESS,
MIGRATION_CAPABILITY_XBZRLE,
MIGRATION_CAPABILITY_X_COLO,
MIGRATION_CAPABILITY_VALIDATE_UUID);
/* When we add fault tolerance, we could have several /* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add migrations at once. For now we don't need to add
dynamic creation of migration */ dynamic creation of migration */
@ -141,6 +174,8 @@ enum mig_rp_message_type {
static MigrationState *current_migration; static MigrationState *current_migration;
static MigrationIncomingState *current_incoming; static MigrationIncomingState *current_incoming;
static GSList *migration_blockers;
static bool migration_object_check(MigrationState *ms, Error **errp); static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s, static int migration_maybe_pause(MigrationState *s,
int *current_active_state, int *current_active_state,
@ -1041,6 +1076,27 @@ static void fill_source_migration_info(MigrationInfo *info)
{ {
MigrationState *s = migrate_get_current(); MigrationState *s = migrate_get_current();
info->blocked = migration_is_blocked(NULL);
info->has_blocked_reasons = info->blocked;
info->blocked_reasons = NULL;
if (info->blocked) {
GSList *cur_blocker = migration_blockers;
/*
* There are two types of reasons a migration might be blocked;
* a) devices marked in VMState as non-migratable, and
* b) Explicit migration blockers
* We need to add both of them here.
*/
qemu_savevm_non_migratable_list(&info->blocked_reasons);
while (cur_blocker) {
QAPI_LIST_PREPEND(info->blocked_reasons,
g_strdup(error_get_pretty(cur_blocker->data)));
cur_blocker = g_slist_next(cur_blocker);
}
}
switch (s->state) { switch (s->state) {
case MIGRATION_STATUS_NONE: case MIGRATION_STATUS_NONE:
/* no migration has happened ever */ /* no migration has happened ever */
@ -1089,6 +1145,31 @@ static void fill_source_migration_info(MigrationInfo *info)
info->status = s->state; info->status = s->state;
} }
/*
 * Level of write-tracking support, as reported by
 * migrate_query_write_tracking().  The enumerators are ordered from least
 * to most capable; callers compare levels with '<', so the order must be
 * preserved.
 */
typedef enum WriteTrackingSupport {
WT_SUPPORT_UNKNOWN = 0, /* not determined */
WT_SUPPORT_ABSENT, /* ram_write_tracking_available() reported false */
WT_SUPPORT_AVAILABLE, /* available, but ram_write_tracking_compatible() reported false */
WT_SUPPORT_COMPATIBLE /* both checks passed */
} WriteTrackingSupport;
/*
 * Classify write-tracking support.
 *
 * The kernel-side check (required UFFD features) runs first; the check of
 * the current memory configuration only runs when the first one succeeds.
 *
 * Returns WT_SUPPORT_ABSENT, WT_SUPPORT_AVAILABLE or WT_SUPPORT_COMPATIBLE.
 */
static
WriteTrackingSupport migrate_query_write_tracking(void)
{
    WriteTrackingSupport level = WT_SUPPORT_COMPATIBLE;

    if (!ram_write_tracking_available()) {
        /* Kernel lacks the required UFFD features */
        level = WT_SUPPORT_ABSENT;
    } else if (!ram_write_tracking_compatible()) {
        /* Features exist, but the memory configuration cannot use them */
        level = WT_SUPPORT_AVAILABLE;
    }

    return level;
}
/** /**
* @migration_caps_check - check capability validity * @migration_caps_check - check capability validity
* *
@ -1150,6 +1231,39 @@ static bool migrate_caps_check(bool *cap_list,
} }
} }
if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
WriteTrackingSupport wt_support;
int idx;
/*
* Check if 'background-snapshot' capability is supported by
* host kernel and compatible with guest memory configuration.
*/
wt_support = migrate_query_write_tracking();
if (wt_support < WT_SUPPORT_AVAILABLE) {
error_setg(errp, "Background-snapshot is not supported by host kernel");
return false;
}
if (wt_support < WT_SUPPORT_COMPATIBLE) {
error_setg(errp, "Background-snapshot is not compatible "
"with guest memory configuration");
return false;
}
/*
* Check if there are any migration capabilities
* incompatible with 'background-snapshot'.
*/
for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
int incomp_cap = check_caps_background_snapshot.caps[idx];
if (cap_list[incomp_cap]) {
error_setg(errp,
"Background-snapshot is not compatible with %s",
MigrationCapability_str(incomp_cap));
return false;
}
}
}
return true; return true;
} }
@ -1226,21 +1340,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
if (params->has_compress_level && if (params->has_compress_level &&
(params->compress_level > 9)) { (params->compress_level > 9)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
"is invalid, it should be in the range of 0 to 9"); "a value between 0 and 9");
return false; return false;
} }
if (params->has_compress_threads && (params->compress_threads < 1)) { if (params->has_compress_threads && (params->compress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"compress_threads", "compress_threads",
"is invalid, it should be in the range of 1 to 255"); "a value between 1 and 255");
return false; return false;
} }
if (params->has_decompress_threads && (params->decompress_threads < 1)) { if (params->has_decompress_threads && (params->decompress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"decompress_threads", "decompress_threads",
"is invalid, it should be in the range of 1 to 255"); "a value between 1 and 255");
return false; return false;
} }
@ -1293,21 +1407,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
if (params->has_multifd_channels && (params->multifd_channels < 1)) { if (params->has_multifd_channels && (params->multifd_channels < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"multifd_channels", "multifd_channels",
"is invalid, it should be in the range of 1 to 255"); "a value between 1 and 255");
return false; return false;
} }
if (params->has_multifd_zlib_level && if (params->has_multifd_zlib_level &&
(params->multifd_zlib_level > 9)) { (params->multifd_zlib_level > 9)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level", error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
"is invalid, it should be in the range of 0 to 9"); "a value between 0 and 9");
return false; return false;
} }
if (params->has_multifd_zstd_level && if (params->has_multifd_zstd_level &&
(params->multifd_zstd_level > 20)) { (params->multifd_zstd_level > 20)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
"is invalid, it should be in the range of 0 to 20"); "a value between 0 and 20");
return false; return false;
} }
@ -1316,8 +1430,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
!is_power_of_2(params->xbzrle_cache_size))) { !is_power_of_2(params->xbzrle_cache_size))) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"xbzrle_cache_size", "xbzrle_cache_size",
"is invalid, it should be bigger than target page size" "a power of two no less than the target page size");
" and a power of 2");
return false; return false;
} }
@ -1334,21 +1447,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
params->announce_initial > 100000) { params->announce_initial > 100000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_initial", "announce_initial",
"is invalid, it must be less than 100000 ms"); "a value between 0 and 100000");
return false; return false;
} }
if (params->has_announce_max && if (params->has_announce_max &&
params->announce_max > 100000) { params->announce_max > 100000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_max", "announce_max",
"is invalid, it must be less than 100000 ms"); "a value between 0 and 100000");
return false; return false;
} }
if (params->has_announce_rounds && if (params->has_announce_rounds &&
params->announce_rounds > 1000) { params->announce_rounds > 1000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_rounds", "announce_rounds",
"is invalid, it must be in the range of 0 to 1000"); "a value between 0 and 1000");
return false; return false;
} }
if (params->has_announce_step && if (params->has_announce_step &&
@ -1356,7 +1469,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
params->announce_step > 10000)) { params->announce_step > 10000)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_step", "announce_step",
"is invalid, it must be in the range of 1 to 10000 ms"); "a value between 0 and 10000");
return false; return false;
} }
@ -1909,6 +2022,7 @@ void migrate_init(MigrationState *s)
* locks. * locks.
*/ */
s->cleanup_bh = 0; s->cleanup_bh = 0;
s->vm_start_bh = 0;
s->to_dst_file = NULL; s->to_dst_file = NULL;
s->state = MIGRATION_STATUS_NONE; s->state = MIGRATION_STATUS_NONE;
s->rp_state.from_dst_file = NULL; s->rp_state.from_dst_file = NULL;
@ -1934,8 +2048,6 @@ void migrate_init(MigrationState *s)
s->threshold_size = 0; s->threshold_size = 0;
} }
static GSList *migration_blockers;
int migrate_add_blocker(Error *reason, Error **errp) int migrate_add_blocker(Error *reason, Error **errp)
{ {
if (only_migratable) { if (only_migratable) {
@ -2216,7 +2328,7 @@ void qmp_migrate_set_cache_size(int64_t value, Error **errp)
qmp_migrate_set_parameters(&p, errp); qmp_migrate_set_parameters(&p, errp);
} }
int64_t qmp_query_migrate_cache_size(Error **errp) uint64_t qmp_query_migrate_cache_size(Error **errp)
{ {
return migrate_xbzrle_cache_size(); return migrate_xbzrle_cache_size();
} }
@ -2446,7 +2558,7 @@ int migrate_use_xbzrle(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
} }
int64_t migrate_xbzrle_cache_size(void) uint64_t migrate_xbzrle_cache_size(void)
{ {
MigrationState *s; MigrationState *s;
@ -2491,6 +2603,15 @@ bool migrate_use_block_incremental(void)
return s->parameters.block_incremental; return s->parameters.block_incremental;
} }
/*
 * Report whether the 'background-snapshot' capability is enabled on the
 * current migration state.
 */
bool migrate_background_snapshot(void)
{
    MigrationState *ms = migrate_get_current();

    return ms->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
}
/* migration thread support */ /* migration thread support */
/* /*
* Something bad happened to the RP stream, mark an error * Something bad happened to the RP stream, mark an error
@ -3117,6 +3238,50 @@ fail:
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILED);
} }
/**
 * bg_migration_completion: Finish a background snapshot once all the RAM
 * has been saved.  Called from bg_migration_iteration_run(), which breaks
 * out of the iteration loop after this returns.
 *
 * Moves the migration state from the state observed on entry to either
 * MIGRATION_STATUS_COMPLETED or MIGRATION_STATUS_FAILED.
 *
 * @s: Current migration state
 */
static void bg_migration_completion(MigrationState *s)
{
    int current_active_state = s->state;
    bool failed = false;

    /*
     * Stop tracking RAM writes - un-protect memory, un-register UFFD
     * memory ranges, flush kernel wait queues and wake up threads
     * waiting for write fault to be resolved.
     */
    ram_write_tracking_stop();

    if (s->state == MIGRATION_STATUS_ACTIVE) {
        /*
         * RAM content is already in the migration stream.  Append the
         * non-RAM content (device state), which was stored into the
         * temporary buffer before RAM saving started, and flush.
         */
        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
        qemu_fflush(s->to_dst_file);
    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
        /* Cancellation in progress: fail without checking the stream */
        failed = true;
    }

    if (!failed && qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        failed = true;
    }

    migrate_set_state(&s->state, current_active_state,
                      failed ? MIGRATION_STATUS_FAILED
                             : MIGRATION_STATUS_COMPLETED);
}
bool migrate_colo_enabled(void) bool migrate_colo_enabled(void)
{ {
MigrationState *s = migrate_get_current(); MigrationState *s = migrate_get_current();
@ -3457,6 +3622,47 @@ static void migration_iteration_finish(MigrationState *s)
qemu_mutex_unlock_iothread(); qemu_mutex_unlock_iothread();
} }
/*
 * Final step of the background snapshot thread: with the iothread lock
 * held, account for a completed migration (or report an unexpected final
 * state) and schedule the migration fd cleanup.
 */
static void bg_migration_iteration_finish(MigrationState *s)
{
    qemu_mutex_lock_iothread();

    switch (s->state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        /* Nothing to account for in these states */
        break;

    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }

    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}
/*
 * Run one iteration of the background snapshot save loop.
 *
 * Returns MIG_ITERATE_BREAK, after calling bg_migration_completion(), when
 * qemu_savevm_state_iterate() returns a positive value; returns
 * MIG_ITERATE_RESUME otherwise.
 */
static MigIterateState bg_migration_iteration_run(MigrationState *s)
{
    if (qemu_savevm_state_iterate(s->to_dst_file, false) <= 0) {
        return MIG_ITERATE_RESUME;
    }

    bg_migration_completion(s);
    return MIG_ITERATE_BREAK;
}
void migration_make_urgent_request(void) void migration_make_urgent_request(void)
{ {
qemu_sem_post(&migrate_get_current()->rate_limit_sem); qemu_sem_post(&migrate_get_current()->rate_limit_sem);
@ -3604,6 +3810,165 @@ static void *migration_thread(void *opaque)
return NULL; return NULL;
} }
/*
 * Bottom-half handler that starts the VM for a background snapshot.
 *
 * Deletes and clears the one-shot BH stored in the migration state, starts
 * the VM, then records the downtime as the QEMU_CLOCK_REALTIME milliseconds
 * elapsed since s->downtime_start.
 *
 * @opaque: the MigrationState
 */
static void bg_migration_vm_start_bh(void *opaque)
{
    MigrationState *ms = opaque;

    /* The BH is used once; drop it before doing the actual work */
    qemu_bh_delete(ms->vm_start_bh);
    ms->vm_start_bh = NULL;

    vm_start();

    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - ms->downtime_start;
}
/**
 * Background snapshot thread, based on live migration code.
 * This is an alternative implementation of the live migration mechanism
 * introduced specifically to support background snapshots.
 *
 * It takes advantage of the userfault_fd write protection mechanism
 * introduced in the v5.7 kernel. Compared to the existing dirty page logging
 * migration, much less stream traffic is produced, resulting in smaller
 * snapshot images, simply because no page duplicates can get into the stream.
 *
 * Another key point is that the generated vmstate stream reflects the machine
 * state 'frozen' at the beginning of snapshot creation, whereas with the
 * dirty page logging mechanism the saved snapshot is effectively the state of
 * the VM at the end of the process.
 *
 * @opaque: the MigrationState
 *
 * Returns NULL.
 */
static void *bg_migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t setup_start;
    MigThrError thr_error;
    QEMUFile *fb;
    bool early_fail = true;

    rcu_register_thread();
    object_ref(OBJECT(s));

    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);

    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    /*
     * We want to save vmstate for the moment when migration has been
     * initiated but also we want to save RAM content while VM is running.
     * The RAM content should appear first in the vmstate. So, we first
     * stash the non-RAM part of the vmstate to the temporary buffer,
     * then write RAM part of the vmstate to the migration stream
     * with vCPUs running and, finally, write stashed non-RAM part of
     * the vmstate from the buffer to the migration stream.
     */
    s->bioc = qio_channel_buffer_new(128 * 1024);
    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
    fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
    object_unref(OBJECT(s->bioc));

    update_iteration_initial_status(s);

    qemu_savevm_state_header(s->to_dst_file);
    qemu_savevm_state_setup(s->to_dst_file);

    if (qemu_savevm_state_guest_unplug_pending()) {
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_WAIT_UNPLUG);

        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
               qemu_savevm_state_guest_unplug_pending()) {
            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
        }

        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
                          MIGRATION_STATUS_ACTIVE);
    } else {
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_ACTIVE);
    }
    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();
    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    qemu_mutex_lock_iothread();

    /*
     * If VM is currently in suspended state, then, to make a valid runstate
     * transition in vm_stop_force_state() we need to wake it up.
     */
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    s->vm_was_running = runstate_is_running();

    if (global_state_store()) {
        goto fail;
    }
    /* Forcibly stop VM before saving state of vCPUs and devices */
    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
        goto fail;
    }
    /*
     * Put vCPUs in sync with shadow context structures, then
     * save their state to channel-buffer along with devices.
     */
    cpu_synchronize_all_states();
    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
        goto fail;
    }
    /*
     * The stashed device state is later taken directly from
     * s->bioc->data / s->bioc->usage rather than through 'fb', so anything
     * still pending in the QEMUFile must be pushed into the channel buffer
     * now; otherwise the tail of the device state would be lost.
     */
    qemu_fflush(fb);

    /* Now initialize UFFD context and start tracking RAM writes */
    if (ram_write_tracking_start()) {
        goto fail;
    }
    early_fail = false;

    /*
     * Start VM from BH handler to avoid write-fault lock here.
     * UFFD-WP protection for the whole RAM is already enabled so
     * calling VM state change notifiers from vm_start() would initiate
     * writes to virtio VQs memory which is in write-protected region.
     */
    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
    qemu_bh_schedule(s->vm_start_bh);

    qemu_mutex_unlock_iothread();

    while (migration_is_active(s)) {
        MigIterateState iter_state = bg_migration_iteration_run(s);
        if (iter_state == MIG_ITERATE_SKIP) {
            continue;
        } else if (iter_state == MIG_ITERATE_BREAK) {
            break;
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        }

        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    }

    trace_migration_thread_after_loop();

fail:
    if (early_fail) {
        /*
         * Failed before write tracking started: the iothread lock taken
         * above is still held on this path and must be dropped here.
         */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        qemu_mutex_unlock_iothread();
    }

    bg_migration_iteration_finish(s);

    qemu_fclose(fb);
    object_unref(OBJECT(s));
    rcu_unregister_thread();

    return NULL;
}
void migrate_fd_connect(MigrationState *s, Error *error_in) void migrate_fd_connect(MigrationState *s, Error *error_in)
{ {
Error *local_err = NULL; Error *local_err = NULL;
@ -3667,8 +4032,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
migrate_fd_cleanup(s); migrate_fd_cleanup(s);
return; return;
} }
qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
QEMU_THREAD_JOINABLE); if (migrate_background_snapshot()) {
qemu_thread_create(&s->thread, "bg_snapshot",
bg_migration_thread, s, QEMU_THREAD_JOINABLE);
} else {
qemu_thread_create(&s->thread, "live_migration",
migration_thread, s, QEMU_THREAD_JOINABLE);
}
s->migration_thread_running = true; s->migration_thread_running = true;
} }
@ -3784,6 +4155,8 @@ static Property migration_properties[] = {
DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
DEFINE_PROP_MIG_CAP("x-background-snapshot",
MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
DEFINE_PROP_END_OF_LIST(), DEFINE_PROP_END_OF_LIST(),
}; };

View File

@ -20,6 +20,7 @@
#include "qemu/thread.h" #include "qemu/thread.h"
#include "qemu/coroutine_int.h" #include "qemu/coroutine_int.h"
#include "io/channel.h" #include "io/channel.h"
#include "io/channel-buffer.h"
#include "net/announce.h" #include "net/announce.h"
#include "qom/object.h" #include "qom/object.h"
@ -147,8 +148,10 @@ struct MigrationState {
/*< public >*/ /*< public >*/
QemuThread thread; QemuThread thread;
QEMUBH *vm_start_bh;
QEMUBH *cleanup_bh; QEMUBH *cleanup_bh;
QEMUFile *to_dst_file; QEMUFile *to_dst_file;
QIOChannelBuffer *bioc;
/* /*
* Protects to_dst_file pointer. We need to make sure we won't * Protects to_dst_file pointer. We need to make sure we won't
* yield or hang during the critical section, since this lock will * yield or hang during the critical section, since this lock will
@ -324,7 +327,7 @@ int migrate_multifd_zlib_level(void);
int migrate_multifd_zstd_level(void); int migrate_multifd_zstd_level(void);
int migrate_use_xbzrle(void); int migrate_use_xbzrle(void);
int64_t migrate_xbzrle_cache_size(void); uint64_t migrate_xbzrle_cache_size(void);
bool migrate_colo_enabled(void); bool migrate_colo_enabled(void);
bool migrate_use_block(void); bool migrate_use_block(void);
@ -341,6 +344,7 @@ int migrate_compress_wait_thread(void);
int migrate_decompress_threads(void); int migrate_decompress_threads(void);
bool migrate_use_events(void); bool migrate_use_events(void);
bool migrate_postcopy_blocktime(void); bool migrate_postcopy_blocktime(void);
bool migrate_background_snapshot(void);
/* Sending on the return path - generic and then for each message type */ /* Sending on the return path - generic and then for each message type */
void migrate_send_rp_shut(MigrationIncomingState *mis, void migrate_send_rp_shut(MigrationIncomingState *mis,

View File

@ -38,7 +38,7 @@ struct PageCache {
size_t num_items; size_t num_items;
}; };
PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) PageCache *cache_init(uint64_t new_size, size_t page_size, Error **errp)
{ {
int64_t i; int64_t i;
size_t num_pages = new_size / page_size; size_t num_pages = new_size / page_size;
@ -60,8 +60,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp)
/* We prefer not to abort if there is no memory */ /* We prefer not to abort if there is no memory */
cache = g_try_malloc(sizeof(*cache)); cache = g_try_malloc(sizeof(*cache));
if (!cache) { if (!cache) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", error_setg(errp, "Failed to allocate cache");
"Failed to allocate cache");
return NULL; return NULL;
} }
cache->page_size = page_size; cache->page_size = page_size;
@ -74,8 +73,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp)
cache->page_cache = g_try_malloc((cache->max_num_items) * cache->page_cache = g_try_malloc((cache->max_num_items) *
sizeof(*cache->page_cache)); sizeof(*cache->page_cache));
if (!cache->page_cache) { if (!cache->page_cache) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", error_setg(errp, "Failed to allocate page cache");
"Failed to allocate page cache");
g_free(cache); g_free(cache);
return NULL; return NULL;
} }

View File

@ -28,7 +28,7 @@ typedef struct PageCache PageCache;
* @page_size: cache page size * @page_size: cache page size
* @errp: set *errp if the check failed, with reason * @errp: set *errp if the check failed, with reason
*/ */
PageCache *cache_init(int64_t cache_size, size_t page_size, Error **errp); PageCache *cache_init(uint64_t cache_size, size_t page_size, Error **errp);
/** /**
* cache_fini: free all cache resources * cache_fini: free all cache resources
* @cache pointer to the PageCache struct * @cache pointer to the PageCache struct

View File

@ -595,7 +595,7 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
{ {
if (size < IO_BUF_SIZE) { if (size < IO_BUF_SIZE) {
size_t res; size_t res;
uint8_t *src; uint8_t *src = NULL;
res = qemu_peek_buffer(f, &src, size, 0); res = qemu_peek_buffer(f, &src, size, 0);

View File

@ -56,6 +56,11 @@
#include "savevm.h" #include "savevm.h"
#include "qemu/iov.h" #include "qemu/iov.h"
#include "multifd.h" #include "multifd.h"
#include "sysemu/runstate.h"
#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */
/***********************************************************/ /***********************************************************/
/* ram save/restore */ /* ram save/restore */
@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void)
* @new_size: new cache size * @new_size: new cache size
* @errp: set *errp if the check failed, with reason * @errp: set *errp if the check failed, with reason
*/ */
int xbzrle_cache_resize(int64_t new_size, Error **errp) int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{ {
PageCache *new_cache; PageCache *new_cache;
int64_t ret = 0; int64_t ret = 0;
@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
struct RAMState { struct RAMState {
/* QEMUFile used for this migration */ /* QEMUFile used for this migration */
QEMUFile *f; QEMUFile *f;
/* UFFD file descriptor, used in 'write-tracking' migration */
int uffdio_fd;
/* Last block that we have visited searching for dirty pages */ /* Last block that we have visited searching for dirty pages */
RAMBlock *last_seen_block; RAMBlock *last_seen_block;
/* Last block from where we have sent data */ /* Last block from where we have sent data */
@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
return block; return block;
} }
#if defined(__linux__)
/**
 * poll_fault_page: fetch the next pending UFFD write-fault event, if any,
 *   and translate the faulting address into a RAM block and page offset
 *
 * Returns the RAMBlock containing the faulting page, or NULL when
 * background snapshot is not enabled or uffd_read_events() returns a
 * non-positive value.
 *
 * @rs: current RAM state
 * @offset: filled with the page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg msg;
    void *fault_addr;
    RAMBlock *bs;

    if (!migrate_background_snapshot() ||
        uffd_read_events(rs->uffdio_fd, &msg, 1) <= 0) {
        return NULL;
    }

    fault_addr = (void *)(uintptr_t) msg.arg.pagefault.address;
    bs = qemu_ram_block_from_host(fault_addr, false, offset);
    /* A fault must resolve to a block that is under UFFD write protection */
    assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);

    return bs;
}
/**
 * ram_save_release_protection: drop UFFD write protection for the pages
 *   from @start_page up to and including pss->page, once they were saved
 *
 * Blocks without RAM_UF_WRITEPROTECT set are left untouched.
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 when there is nothing to do, otherwise the result of
 * uffd_change_protection()
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    void *range_start;
    uint64_t range_len;

    /* Only UFFD-managed regions need any work */
    if (!(pss->block->flags & RAM_UF_WRITEPROTECT)) {
        return 0;
    }

    range_start = pss->block->host + (start_page << TARGET_PAGE_BITS);
    range_len = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

    /* Flush async buffers before un-protect. */
    qemu_fflush(rs->f);

    /* Un-protect memory range. */
    return uffd_change_protection(rs->uffdio_fd, range_start, range_len,
                                  false, false);
}
/*
 * ram_write_tracking_available: check whether the UFFD features reported by
 *   uffd_query_features() include UFFD_FEATURE_PAGEFAULT_FLAG_WP
 *
 * Returns true if the query succeeds and the feature bit is set,
 * false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t features;

    if (uffd_query_features(&features) != 0) {
        return false;
    }

    return (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0;
}
/*
 * ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Opens a temporary UFFD descriptor and tries to register every
 * non-ignored, writable RAM block in write-protect mode, verifying that
 * the UFFDIO_WRITEPROTECT ioctl is reported for each of them. The
 * descriptor is closed before returning.
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t required_ioctls = BIT(_UFFDIO_WRITEPROTECT);
    bool compatible = false;
    RAMBlock *block;
    int probe_fd;

    /* Open UFFD file descriptor */
    probe_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (probe_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t block_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /*
         * Try to register block memory via UFFD-IO to track writes; the
         * reported ioctl set is only examined when registration succeeds.
         */
        if (uffd_register_memory(probe_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &block_ioctls) ||
            (block_ioctls & required_ioctls) != required_ioctls) {
            goto out;
        }
    }
    compatible = true;

out:
    uffd_close_fd(probe_fd);
    return compatible;
}
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Creates the UFFD descriptor, stores it in the RAM state, then registers
 * and write-protects every non-ignored, writable RAM block. Each block that
 * was protected gets RAM_UF_WRITEPROTECT set and a reference on its memory
 * region. On failure the blocks protected so far are rolled back and the
 * descriptor is closed.
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int uffd_fd;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Register block memory with UFFD to track writes and, only if
         * that succeeded, apply UFFD write protection to the same range.
         */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP,
                                 NULL) ||
            uffd_change_protection(rs->uffdio_fd, block->host,
                                   block->max_length, true, false)) {
            goto fail;
        }

        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if (block->flags & RAM_UF_WRITEPROTECT) {
            /*
             * In case some memory block failed to be write-protected
             * remove protection and unregister all succeeded RAM blocks
             */
            uffd_change_protection(rs->uffdio_fd, block->host,
                                   block->max_length, false, false);
            uffd_unregister_memory(rs->uffdio_fd, block->host,
                                   block->max_length);
            /* Cleanup flags and remove reference */
            block->flags &= ~RAM_UF_WRITEPROTECT;
            memory_region_unref(block->mr);
        }
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 *
 * For every non-ignored RAM block flagged RAM_UF_WRITEPROTECT: removes the
 * write protection, unregisters the range from UFFD, clears the flag and
 * drops the memory region reference. Finally closes the UFFD descriptor
 * kept in the RAM state and resets it to -1.
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if (block->flags & RAM_UF_WRITEPROTECT) {
            /* Remove protection and unregister all affected RAM blocks */
            uffd_change_protection(rs->uffdio_fd, block->host,
                                   block->max_length, false, false);
            uffd_unregister_memory(rs->uffdio_fd, block->host,
                                   block->max_length);

            trace_ram_write_tracking_ramblock_stop(block->idstr,
                                                   block->page_size,
                                                   block->host,
                                                   block->max_length);

            /* Cleanup flags and remove reference */
            block->flags &= ~RAM_UF_WRITEPROTECT;
            memory_region_unref(block->mr);
        }
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}
#else
/* No target OS support, stubs just fail or ignore */
/* Stub: never reports a pending write fault. */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
(void) rs;
(void) offset;
return NULL;
}
/* Stub: nothing is ever write-protected here, so always succeeds with 0. */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
unsigned long start_page)
{
(void) rs;
(void) pss;
(void) start_page;
return 0;
}
/* Stub: write tracking is reported as unavailable. */
bool ram_write_tracking_available(void)
{
return false;
}
/*
* Stub: asserts if ever called.
* NOTE(review): presumably callers check ram_write_tracking_available()
* first, so this is never reached - confirm against the callers.
*/
bool ram_write_tracking_compatible(void)
{
assert(0);
return false;
}
/* Stub: asserts if ever called; the -1 return is only reachable if asserts are compiled out. */
int ram_write_tracking_start(void)
{
assert(0);
return -1;
}
/* Stub: asserts if ever called. */
void ram_write_tracking_stop(void)
{
assert(0);
}
#endif /* defined(__linux__) */
/** /**
* get_queued_page: unqueue a page from the postcopy requests * get_queued_page: unqueue a page from the postcopy requests
* *
@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
} while (block && !dirty); } while (block && !dirty);
if (!block) {
/*
* Poll write faults too if background snapshot is enabled; that's
* when we have vcpus got blocked by the write protected pages.
*/
block = poll_fault_page(rs, &offset);
}
if (block) { if (block) {
/* /*
* As soon as we start servicing pages out of order, then we have * As soon as we start servicing pages out of order, then we have
@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
int tmppages, pages = 0; int tmppages, pages = 0;
size_t pagesize_bits = size_t pagesize_bits =
qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
unsigned long start_page = pss->page;
int res;
if (ramblock_is_ignored(pss->block)) { if (ramblock_is_ignored(pss->block)) {
error_report("block %s should not be migrated !", pss->block->idstr); error_report("block %s should not be migrated !", pss->block->idstr);
@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
} while ((pss->page & (pagesize_bits - 1)) && } while ((pss->page & (pagesize_bits - 1)) &&
offset_in_ramblock(pss->block, offset_in_ramblock(pss->block,
((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
/* The offset we leave with is the last one we looked at */ /* The offset we leave with is the last one we looked at */
pss->page--; pss->page--;
return pages;
res = ram_save_release_protection(rs, pss, start_page);
return (res < 0 ? res : pages);
} }
/** /**
@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque)
RAMState **rsp = opaque; RAMState **rsp = opaque;
RAMBlock *block; RAMBlock *block;
/* caller have hold iothread lock or is in a bh, so there is /* We don't use dirty log with background snapshots */
* no writing race against the migration bitmap if (!migrate_background_snapshot()) {
*/ /* caller have hold iothread lock or is in a bh, so there is
memory_global_dirty_log_stop(); * no writing race against the migration bitmap
*/
memory_global_dirty_log_stop();
}
RAMBLOCK_FOREACH_NOT_IGNORED(block) { RAMBLOCK_FOREACH_NOT_IGNORED(block) {
g_free(block->clear_bmap); g_free(block->clear_bmap);
@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs)
WITH_RCU_READ_LOCK_GUARD() { WITH_RCU_READ_LOCK_GUARD() {
ram_list_init_bitmaps(); ram_list_init_bitmaps();
memory_global_dirty_log_start(); /* We don't use dirty log with background snapshots */
migration_bitmap_sync_precopy(rs); if (!migrate_background_snapshot()) {
memory_global_dirty_log_start();
migration_bitmap_sync_precopy(rs);
}
} }
qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_ramlist();
qemu_mutex_unlock_iothread(); qemu_mutex_unlock_iothread();
@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f)
} }
} }
/* For postcopy we need to check hugepage sizes match */ /* For postcopy we need to check hugepage sizes match */
if (postcopy_advised && if (postcopy_advised && migrate_postcopy_ram() &&
block->page_size != qemu_host_page_size) { block->page_size != qemu_host_page_size) {
uint64_t remote_page_size = qemu_get_be64(f); uint64_t remote_page_size = qemu_get_be64(f);
if (remote_page_size != block->page_size) { if (remote_page_size != block->page_size) {

View File

@ -47,7 +47,7 @@ bool ramblock_is_ignored(RAMBlock *block);
INTERNAL_RAMBLOCK_FOREACH(block) \ INTERNAL_RAMBLOCK_FOREACH(block) \
if (!qemu_ram_is_migratable(block)) {} else if (!qemu_ram_is_migratable(block)) {} else
int xbzrle_cache_resize(int64_t new_size, Error **errp); int xbzrle_cache_resize(uint64_t new_size, Error **errp);
uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_remaining(void);
uint64_t ram_bytes_total(void); uint64_t ram_bytes_total(void);
@ -79,4 +79,10 @@ void colo_flush_ram_cache(void);
void colo_release_ram_cache(void); void colo_release_ram_cache(void);
void colo_incoming_start_dirty_log(void); void colo_incoming_start_dirty_log(void);
/* Background snapshot */
bool ram_write_tracking_available(void);
bool ram_write_tracking_compatible(void);
int ram_write_tracking_start(void);
void ram_write_tracking_stop(void);
#endif #endif

View File

@ -43,6 +43,8 @@
#include "qapi/error.h" #include "qapi/error.h"
#include "qapi/qapi-commands-migration.h" #include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h" #include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h" #include "qapi/qmp/qerror.h"
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "sysemu/cpus.h" #include "sysemu/cpus.h"
@ -315,6 +317,16 @@ static int configuration_pre_save(void *opaque)
return 0; return 0;
} }
/*
 * post_save hook of the configuration section: release the capabilities
 * array held in the SaveState and reset its element count.
 *
 * @opaque: the SaveState
 *
 * Always returns 0.
 */
static int configuration_post_save(void *opaque)
{
    SaveState *st = opaque;

    st->caps_count = 0;
    g_free(st->capabilities);
    st->capabilities = NULL;

    return 0;
}
static int configuration_pre_load(void *opaque) static int configuration_pre_load(void *opaque)
{ {
SaveState *state = opaque; SaveState *state = opaque;
@ -365,24 +377,36 @@ static int configuration_post_load(void *opaque, int version_id)
{ {
SaveState *state = opaque; SaveState *state = opaque;
const char *current_name = MACHINE_GET_CLASS(current_machine)->name; const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
int ret = 0;
if (strncmp(state->name, current_name, state->len) != 0) { if (strncmp(state->name, current_name, state->len) != 0) {
error_report("Machine type received is '%.*s' and local is '%s'", error_report("Machine type received is '%.*s' and local is '%s'",
(int) state->len, state->name, current_name); (int) state->len, state->name, current_name);
return -EINVAL; ret = -EINVAL;
goto out;
} }
if (state->target_page_bits != qemu_target_page_bits()) { if (state->target_page_bits != qemu_target_page_bits()) {
error_report("Received TARGET_PAGE_BITS is %d but local is %d", error_report("Received TARGET_PAGE_BITS is %d but local is %d",
state->target_page_bits, qemu_target_page_bits()); state->target_page_bits, qemu_target_page_bits());
return -EINVAL; ret = -EINVAL;
goto out;
} }
if (!configuration_validate_capabilities(state)) { if (!configuration_validate_capabilities(state)) {
return -EINVAL; ret = -EINVAL;
goto out;
} }
return 0; out:
g_free((void *)state->name);
state->name = NULL;
state->len = 0;
g_free(state->capabilities);
state->capabilities = NULL;
state->caps_count = 0;
return ret;
} }
static int get_capability(QEMUFile *f, void *pv, size_t size, static int get_capability(QEMUFile *f, void *pv, size_t size,
@ -516,6 +540,7 @@ static const VMStateDescription vmstate_configuration = {
.pre_load = configuration_pre_load, .pre_load = configuration_pre_load,
.post_load = configuration_post_load, .post_load = configuration_post_load,
.pre_save = configuration_pre_save, .pre_save = configuration_pre_save,
.post_save = configuration_post_save,
.fields = (VMStateField[]) { .fields = (VMStateField[]) {
VMSTATE_UINT32(len, SaveState), VMSTATE_UINT32(len, SaveState),
VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
@ -1131,6 +1156,19 @@ bool qemu_savevm_state_blocked(Error **errp)
return false; return false;
} }
/*
 * Prepend to @reasons one newly allocated "non-migratable device: <idstr>"
 * string for every registered savevm handler whose VMStateDescription is
 * marked unmigratable.
 *
 * @reasons: list head to prepend to
 */
void qemu_savevm_non_migratable_list(strList **reasons)
{
    SaveStateEntry *entry_it;

    QTAILQ_FOREACH(entry_it, &savevm_state.handlers, entry) {
        /* Handlers without a vmsd, or with a migratable one, are skipped */
        if (!entry_it->vmsd || !entry_it->vmsd->unmigratable) {
            continue;
        }

        QAPI_LIST_PREPEND(*reasons,
                          g_strdup_printf("non-migratable device: %s",
                                          entry_it->idstr));
    }
}
void qemu_savevm_state_header(QEMUFile *f) void qemu_savevm_state_header(QEMUFile *f)
{ {
trace_savevm_state_header(); trace_savevm_state_header();
@ -1355,7 +1393,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
return 0; return 0;
} }
static
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy, bool in_postcopy,
bool inactivate_disks) bool inactivate_disks)
@ -2729,9 +2766,10 @@ int qemu_load_device_state(QEMUFile *f)
return 0; return 0;
} }
int save_snapshot(const char *name, Error **errp) bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
bool has_devices, strList *devices, Error **errp)
{ {
BlockDriverState *bs, *bs1; BlockDriverState *bs;
QEMUSnapshotInfo sn1, *sn = &sn1; QEMUSnapshotInfo sn1, *sn = &sn1;
int ret = -1, ret2; int ret = -1, ret2;
QEMUFile *f; QEMUFile *f;
@ -2742,35 +2780,43 @@ int save_snapshot(const char *name, Error **errp)
AioContext *aio_context; AioContext *aio_context;
if (migration_is_blocked(errp)) { if (migration_is_blocked(errp)) {
return ret; return false;
} }
if (!replay_can_snapshot()) { if (!replay_can_snapshot()) {
error_setg(errp, "Record/replay does not allow making snapshot " error_setg(errp, "Record/replay does not allow making snapshot "
"right now. Try once more later."); "right now. Try once more later.");
return ret; return false;
} }
if (!bdrv_all_can_snapshot(&bs)) { if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
error_setg(errp, "Device '%s' is writable but does not support " return false;
"snapshots", bdrv_get_device_or_node_name(bs));
return ret;
} }
/* Delete old snapshots of the same name */ /* Delete old snapshots of the same name */
if (name) { if (name) {
ret = bdrv_all_delete_snapshot(name, &bs1, errp); if (overwrite) {
if (ret < 0) { if (bdrv_all_delete_snapshot(name, has_devices,
error_prepend(errp, "Error while deleting snapshot on device " devices, errp) < 0) {
"'%s': ", bdrv_get_device_or_node_name(bs1)); return false;
return ret; }
} else {
ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
if (ret2 < 0) {
return false;
}
if (ret2 == 1) {
error_setg(errp,
"Snapshot '%s' already exists in one or more devices",
name);
return false;
}
} }
} }
bs = bdrv_all_find_vmstate_bs(); bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
if (bs == NULL) { if (bs == NULL) {
error_setg(errp, "No block device can accept snapshots"); return false;
return ret;
} }
aio_context = bdrv_get_aio_context(bs); aio_context = bdrv_get_aio_context(bs);
@ -2779,7 +2825,7 @@ int save_snapshot(const char *name, Error **errp)
ret = global_state_store(); ret = global_state_store();
if (ret) { if (ret) {
error_setg(errp, "Error saving global state"); error_setg(errp, "Error saving global state");
return ret; return false;
} }
vm_stop(RUN_STATE_SAVE_VM); vm_stop(RUN_STATE_SAVE_VM);
@ -2833,11 +2879,10 @@ int save_snapshot(const char *name, Error **errp)
aio_context_release(aio_context); aio_context_release(aio_context);
aio_context = NULL; aio_context = NULL;
ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
has_devices, devices, errp);
if (ret < 0) { if (ret < 0) {
error_setg(errp, "Error while creating snapshot on '%s'", bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
bdrv_get_device_or_node_name(bs));
bdrv_all_delete_snapshot(sn->name, &bs, NULL);
goto the_end; goto the_end;
} }
@ -2853,7 +2898,7 @@ int save_snapshot(const char *name, Error **errp)
if (saved_vm_running) { if (saved_vm_running) {
vm_start(); vm_start();
} }
return ret; return ret == 0;
} }
void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
@ -2938,33 +2983,32 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
migration_incoming_state_destroy(); migration_incoming_state_destroy();
} }
int load_snapshot(const char *name, Error **errp) bool load_snapshot(const char *name, const char *vmstate,
bool has_devices, strList *devices, Error **errp)
{ {
BlockDriverState *bs, *bs_vm_state; BlockDriverState *bs_vm_state;
QEMUSnapshotInfo sn; QEMUSnapshotInfo sn;
QEMUFile *f; QEMUFile *f;
int ret; int ret;
AioContext *aio_context; AioContext *aio_context;
MigrationIncomingState *mis = migration_incoming_get_current(); MigrationIncomingState *mis = migration_incoming_get_current();
if (!bdrv_all_can_snapshot(&bs)) { if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
error_setg(errp, return false;
"Device '%s' is writable but does not support snapshots",
bdrv_get_device_or_node_name(bs));
return -ENOTSUP;
} }
ret = bdrv_all_find_snapshot(name, &bs); ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
if (ret < 0) { if (ret < 0) {
error_setg(errp, return false;
"Device '%s' does not have the requested snapshot '%s'", }
bdrv_get_device_or_node_name(bs), name); if (ret == 0) {
return ret; error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
name);
return false;
} }
bs_vm_state = bdrv_all_find_vmstate_bs(); bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
if (!bs_vm_state) { if (!bs_vm_state) {
error_setg(errp, "No block device supports snapshots"); return false;
return -ENOTSUP;
} }
aio_context = bdrv_get_aio_context(bs_vm_state); aio_context = bdrv_get_aio_context(bs_vm_state);
@ -2973,11 +3017,11 @@ int load_snapshot(const char *name, Error **errp)
ret = bdrv_snapshot_find(bs_vm_state, &sn, name); ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
aio_context_release(aio_context); aio_context_release(aio_context);
if (ret < 0) { if (ret < 0) {
return ret; return false;
} else if (sn.vm_state_size == 0) { } else if (sn.vm_state_size == 0) {
error_setg(errp, "This is a disk-only snapshot. Revert to it " error_setg(errp, "This is a disk-only snapshot. Revert to it "
" offline using qemu-img"); " offline using qemu-img");
return -EINVAL; return false;
} }
/* /*
@ -2989,10 +3033,8 @@ int load_snapshot(const char *name, Error **errp)
/* Flush all IO requests so they don't interfere with the new state. */ /* Flush all IO requests so they don't interfere with the new state. */
bdrv_drain_all_begin(); bdrv_drain_all_begin();
ret = bdrv_all_goto_snapshot(name, &bs, errp); ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
if (ret < 0) { if (ret < 0) {
error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
name, bdrv_get_device_or_node_name(bs));
goto err_drain; goto err_drain;
} }
@ -3000,7 +3042,6 @@ int load_snapshot(const char *name, Error **errp)
f = qemu_fopen_bdrv(bs_vm_state, 0); f = qemu_fopen_bdrv(bs_vm_state, 0);
if (!f) { if (!f) {
error_setg(errp, "Could not open VM state file"); error_setg(errp, "Could not open VM state file");
ret = -EINVAL;
goto err_drain; goto err_drain;
} }
@ -3020,14 +3061,28 @@ int load_snapshot(const char *name, Error **errp)
if (ret < 0) { if (ret < 0) {
error_setg(errp, "Error %d while loading VM state", ret); error_setg(errp, "Error %d while loading VM state", ret);
return ret; return false;
} }
return 0; return true;
err_drain: err_drain:
bdrv_drain_all_end(); bdrv_drain_all_end();
return ret; return false;
}
bool delete_snapshot(const char *name, bool has_devices,
strList *devices, Error **errp)
{
if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
return false;
}
if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
return false;
}
return true;
} }
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
@ -3057,3 +3112,187 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
return !(vmsd && vmsd->unmigratable); return !(vmsd && vmsd->unmigratable);
} }
/*
 * Per-job state shared by the snapshot-save, snapshot-load and
 * snapshot-delete jobs.  The QMP handlers fill in the request arguments,
 * the .run callbacks record @co and @errp, and the bottom halves store
 * the outcome in @ret.
 */
typedef struct SnapshotJob {
/* Generic job object; the handlers get back to SnapshotJob via container_of() */
Job common;
/* Snapshot name, duplicated from the QMP "tag" argument */
char *tag;
/* Node name for the vmstate; qmp_snapshot_delete() does not assign it */
char *vmstate;
/* Copy of the QMP "devices" list */
strList *devices;
/* Job coroutine that yields in .run and is woken by the bottom half */
Coroutine *co;
/* Error destination passed to .run, handed on to save/load/delete */
Error **errp;
/* Result of the snapshot operation; .run maps true to 0 and false to -1 */
bool ret;
} SnapshotJob;
/*
 * Release the request arguments (tag, vmstate node name, device list)
 * held by a snapshot job.
 *
 * Only the members are freed, not @s itself, so the pointers are reset
 * to NULL afterwards: the job object is created with JOB_MANUAL_DISMISS
 * and stays around after the work is done, and must not be left holding
 * dangling pointers.  Calling this function twice is harmless.
 */
static void qmp_snapshot_job_free(SnapshotJob *s)
{
    g_free(s->tag);
    s->tag = NULL;

    g_free(s->vmstate);
    s->vmstate = NULL;

    qapi_free_strList(s->devices);
    s->devices = NULL;
}
static void snapshot_load_job_bh(void *opaque)
{
Job *job = opaque;
SnapshotJob *s = container_of(job, SnapshotJob, common);
int orig_vm_running;
job_progress_set_remaining(&s->common, 1);
orig_vm_running = runstate_is_running();
vm_stop(RUN_STATE_RESTORE_VM);
s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
if (s->ret && orig_vm_running) {
vm_start();
}
job_progress_update(&s->common, 1);
qmp_snapshot_job_free(s);
aio_co_wake(s->co);
}
/*
 * Bottom half doing the actual work of a snapshot-save job.
 *
 * @opaque is the Job scheduled by snapshot_save_job_run().  The result of
 * save_snapshot() is stored in the job, progress is reported as a single
 * step, and the waiting job coroutine is woken.
 */
static void snapshot_save_job_bh(void *opaque)
{
    Job *job = opaque;
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    job_progress_set_remaining(job, 1);

    s->ret = save_snapshot(s->tag, false, s->vmstate, true, s->devices,
                           s->errp);

    job_progress_update(job, 1);

    /* Drop the request arguments, then hand control back to the job. */
    qmp_snapshot_job_free(s);
    aio_co_wake(s->co);
}
/*
 * Bottom half doing the actual work of a snapshot-delete job.
 *
 * @opaque is the Job scheduled by snapshot_delete_job_run().  The result
 * of delete_snapshot() is stored in the job, progress is reported as a
 * single step, and the waiting job coroutine is woken.
 */
static void snapshot_delete_job_bh(void *opaque)
{
    Job *job = opaque;
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    job_progress_set_remaining(job, 1);

    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);

    job_progress_update(job, 1);

    /* Drop the request arguments, then hand control back to the job. */
    qmp_snapshot_job_free(s);
    aio_co_wake(s->co);
}
/*
 * .run callback of the snapshot-save job.
 *
 * Records the current coroutine and @errp in the job, schedules
 * snapshot_save_job_bh() on the AioContext returned by
 * qemu_get_aio_context() and yields until that bottom half wakes it.
 *
 * Returns 0 if the bottom half reported success, -1 otherwise.
 */
static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    s->co = qemu_coroutine_self();
    s->errp = errp;

    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_save_job_bh, job);
    qemu_coroutine_yield();

    if (!s->ret) {
        return -1;
    }
    return 0;
}
/*
 * .run callback of the snapshot-load job.
 *
 * Records the current coroutine and @errp in the job, schedules
 * snapshot_load_job_bh() on the AioContext returned by
 * qemu_get_aio_context() and yields until that bottom half wakes it.
 *
 * Returns 0 if the bottom half reported success, -1 otherwise.
 */
static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    s->co = qemu_coroutine_self();
    s->errp = errp;

    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_load_job_bh, job);
    qemu_coroutine_yield();

    if (!s->ret) {
        return -1;
    }
    return 0;
}
/*
 * .run callback of the snapshot-delete job.
 *
 * Records the current coroutine and @errp in the job, schedules
 * snapshot_delete_job_bh() on the AioContext returned by
 * qemu_get_aio_context() and yields until that bottom half wakes it.
 *
 * Returns 0 if the bottom half reported success, -1 otherwise.
 */
static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    s->co = qemu_coroutine_self();
    s->errp = errp;

    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_delete_job_bh, job);
    qemu_coroutine_yield();

    if (!s->ret) {
        return -1;
    }
    return 0;
}
/* Job driver backing the snapshot-load QMP command. */
static const JobDriver snapshot_load_job_driver = {
    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
    .instance_size = sizeof(SnapshotJob),
    .run           = snapshot_load_job_run,
};
/* Job driver backing the snapshot-save QMP command. */
static const JobDriver snapshot_save_job_driver = {
    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
    .instance_size = sizeof(SnapshotJob),
    .run           = snapshot_save_job_run,
};
/* Job driver backing the snapshot-delete QMP command. */
static const JobDriver snapshot_delete_job_driver = {
    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
    .instance_size = sizeof(SnapshotJob),
    .run           = snapshot_delete_job_run,
};
/*
 * QMP handler for snapshot-save.
 *
 * Creates a manually dismissed snapshot-save job named @job_id, gives it
 * private copies of @tag, @vmstate and @devices, and starts it.  If the
 * job cannot be created the function returns with the failure reported
 * through @errp by job_create().
 */
void qmp_snapshot_save(const char *job_id,
                       const char *tag,
                       const char *vmstate,
                       strList *devices,
                       Error **errp)
{
    SnapshotJob *s = job_create(job_id, &snapshot_save_job_driver, NULL,
                                qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                                NULL, NULL, errp);

    if (s == NULL) {
        return;
    }

    /* The job outlives this call, so it needs its own copies. */
    s->devices = QAPI_CLONE(strList, devices);
    s->vmstate = g_strdup(vmstate);
    s->tag = g_strdup(tag);

    job_start(&s->common);
}
/*
 * QMP handler for snapshot-load.
 *
 * Creates a manually dismissed snapshot-load job named @job_id, gives it
 * private copies of @tag, @vmstate and @devices, and starts it.  If the
 * job cannot be created the function returns with the failure reported
 * through @errp by job_create().
 */
void qmp_snapshot_load(const char *job_id,
                       const char *tag,
                       const char *vmstate,
                       strList *devices,
                       Error **errp)
{
    SnapshotJob *s = job_create(job_id, &snapshot_load_job_driver, NULL,
                                qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                                NULL, NULL, errp);

    if (s == NULL) {
        return;
    }

    /* The job outlives this call, so it needs its own copies. */
    s->devices = QAPI_CLONE(strList, devices);
    s->vmstate = g_strdup(vmstate);
    s->tag = g_strdup(tag);

    job_start(&s->common);
}
/*
 * QMP handler for snapshot-delete.
 *
 * Creates a manually dismissed snapshot-delete job named @job_id, gives
 * it private copies of @tag and @devices, and starts it.  If the job
 * cannot be created the function returns with the failure reported
 * through @errp by job_create().
 */
void qmp_snapshot_delete(const char *job_id,
                         const char *tag,
                         strList *devices,
                         Error **errp)
{
    SnapshotJob *s = job_create(job_id, &snapshot_delete_job_driver, NULL,
                                qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                                NULL, NULL, errp);

    if (s == NULL) {
        return;
    }

    /* The job outlives this call, so it needs its own copies. */
    s->devices = QAPI_CLONE(strList, devices);
    s->tag = g_strdup(tag);

    job_start(&s->common);
}

View File

@ -30,6 +30,7 @@
#define QEMU_VM_SECTION_FOOTER 0x7e #define QEMU_VM_SECTION_FOOTER 0x7e
bool qemu_savevm_state_blocked(Error **errp); bool qemu_savevm_state_blocked(Error **errp);
void qemu_savevm_non_migratable_list(strList **reasons);
void qemu_savevm_state_setup(QEMUFile *f); void qemu_savevm_state_setup(QEMUFile *f);
bool qemu_savevm_state_guest_unplug_pending(void); bool qemu_savevm_state_guest_unplug_pending(void);
int qemu_savevm_state_resume_prepare(MigrationState *s); int qemu_savevm_state_resume_prepare(MigrationState *s);
@ -64,5 +65,7 @@ int qemu_loadvm_state(QEMUFile *f);
void qemu_loadvm_state_cleanup(void); void qemu_loadvm_state_cleanup(void);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
int qemu_load_device_state(QEMUFile *f); int qemu_load_device_state(QEMUFile *f);
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy, bool inactivate_disks);
#endif #endif

View File

@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) "" save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
# multifd.c # multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %d" multifd_new_send_channel_async(uint8_t id) "channel %d"

View File

@ -224,6 +224,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
migration_global_dump(mon); migration_global_dump(mon);
if (info->blocked) {
strList *reasons = info->blocked_reasons;
monitor_printf(mon, "Outgoing migration blocked:\n");
while (reasons) {
monitor_printf(mon, " %s\n", reasons->value);
reasons = reasons->next;
}
}
if (info->has_status) { if (info->has_status) {
monitor_printf(mon, "Migration status: %s", monitor_printf(mon, "Migration status: %s",
MigrationStatus_str(info->status)); MigrationStatus_str(info->status));
@ -1130,7 +1139,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict)
vm_stop(RUN_STATE_RESTORE_VM); vm_stop(RUN_STATE_RESTORE_VM);
/* load_snapshot() returns true on success; resume the guest only then. */
if (load_snapshot(name, &err) == 0 && saved_vm_running) { if (load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) {
vm_start(); vm_start();
} }
hmp_handle_error(mon, err); hmp_handle_error(mon, err);
@ -1140,21 +1149,17 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
{ {
Error *err = NULL; Error *err = NULL;
save_snapshot(qdict_get_try_str(qdict, "name"), &err); save_snapshot(qdict_get_try_str(qdict, "name"),
true, NULL, false, NULL, &err);
hmp_handle_error(mon, err); hmp_handle_error(mon, err);
} }
void hmp_delvm(Monitor *mon, const QDict *qdict) void hmp_delvm(Monitor *mon, const QDict *qdict)
{ {
BlockDriverState *bs;
Error *err = NULL; Error *err = NULL;
const char *name = qdict_get_str(qdict, "name"); const char *name = qdict_get_str(qdict, "name");
if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { delete_snapshot(name, false, NULL, &err);
error_prepend(&err,
"deleting snapshot on device '%s': ",
bdrv_get_device_name(bs));
}
hmp_handle_error(mon, err); hmp_handle_error(mon, err);
} }
@ -1294,11 +1299,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
switch (val) { switch (val) {
case MIGRATION_PARAMETER_COMPRESS_LEVEL: case MIGRATION_PARAMETER_COMPRESS_LEVEL:
p->has_compress_level = true; p->has_compress_level = true;
visit_type_int(v, param, &p->compress_level, &err); visit_type_uint8(v, param, &p->compress_level, &err);
break; break;
case MIGRATION_PARAMETER_COMPRESS_THREADS: case MIGRATION_PARAMETER_COMPRESS_THREADS:
p->has_compress_threads = true; p->has_compress_threads = true;
visit_type_int(v, param, &p->compress_threads, &err); visit_type_uint8(v, param, &p->compress_threads, &err);
break; break;
case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD: case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD:
p->has_compress_wait_thread = true; p->has_compress_wait_thread = true;
@ -1306,19 +1311,19 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break; break;
case MIGRATION_PARAMETER_DECOMPRESS_THREADS: case MIGRATION_PARAMETER_DECOMPRESS_THREADS:
p->has_decompress_threads = true; p->has_decompress_threads = true;
visit_type_int(v, param, &p->decompress_threads, &err); visit_type_uint8(v, param, &p->decompress_threads, &err);
break; break;
case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD:
p->has_throttle_trigger_threshold = true; p->has_throttle_trigger_threshold = true;
visit_type_int(v, param, &p->throttle_trigger_threshold, &err); visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err);
break; break;
case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL: case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL:
p->has_cpu_throttle_initial = true; p->has_cpu_throttle_initial = true;
visit_type_int(v, param, &p->cpu_throttle_initial, &err); visit_type_uint8(v, param, &p->cpu_throttle_initial, &err);
break; break;
case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT: case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT:
p->has_cpu_throttle_increment = true; p->has_cpu_throttle_increment = true;
visit_type_int(v, param, &p->cpu_throttle_increment, &err); visit_type_uint8(v, param, &p->cpu_throttle_increment, &err);
break; break;
case MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW: case MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW:
p->has_cpu_throttle_tailslow = true; p->has_cpu_throttle_tailslow = true;
@ -1326,7 +1331,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break; break;
case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: case MIGRATION_PARAMETER_MAX_CPU_THROTTLE:
p->has_max_cpu_throttle = true; p->has_max_cpu_throttle = true;
visit_type_int(v, param, &p->max_cpu_throttle, &err); visit_type_uint8(v, param, &p->max_cpu_throttle, &err);
break; break;
case MIGRATION_PARAMETER_TLS_CREDS: case MIGRATION_PARAMETER_TLS_CREDS:
p->has_tls_creds = true; p->has_tls_creds = true;
@ -1362,11 +1367,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break; break;
case MIGRATION_PARAMETER_DOWNTIME_LIMIT: case MIGRATION_PARAMETER_DOWNTIME_LIMIT:
p->has_downtime_limit = true; p->has_downtime_limit = true;
visit_type_int(v, param, &p->downtime_limit, &err); visit_type_size(v, param, &p->downtime_limit, &err);
break; break;
case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY: case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY:
p->has_x_checkpoint_delay = true; p->has_x_checkpoint_delay = true;
visit_type_int(v, param, &p->x_checkpoint_delay, &err); visit_type_uint32(v, param, &p->x_checkpoint_delay, &err);
break; break;
case MIGRATION_PARAMETER_BLOCK_INCREMENTAL: case MIGRATION_PARAMETER_BLOCK_INCREMENTAL:
p->has_block_incremental = true; p->has_block_incremental = true;
@ -1374,7 +1379,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break; break;
case MIGRATION_PARAMETER_MULTIFD_CHANNELS: case MIGRATION_PARAMETER_MULTIFD_CHANNELS:
p->has_multifd_channels = true; p->has_multifd_channels = true;
visit_type_int(v, param, &p->multifd_channels, &err); visit_type_uint8(v, param, &p->multifd_channels, &err);
break; break;
case MIGRATION_PARAMETER_MULTIFD_COMPRESSION: case MIGRATION_PARAMETER_MULTIFD_COMPRESSION:
p->has_multifd_compression = true; p->has_multifd_compression = true;
@ -1383,11 +1388,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break; break;
case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL: case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL:
p->has_multifd_zlib_level = true; p->has_multifd_zlib_level = true;
visit_type_int(v, param, &p->multifd_zlib_level, &err); visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
break; break;
case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
p->has_multifd_zstd_level = true; p->has_multifd_zstd_level = true;
visit_type_int(v, param, &p->multifd_zstd_level, &err); visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
break; break;
case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE:
p->has_xbzrle_cache_size = true; p->has_xbzrle_cache_size = true;

View File

@ -22,10 +22,17 @@
# #
# @amend: image options amend job type, see "x-blockdev-amend" (since 5.1) # @amend: image options amend job type, see "x-blockdev-amend" (since 5.1)
# #
# @snapshot-load: snapshot load job type, see "snapshot-load" (since 6.0)
#
# @snapshot-save: snapshot save job type, see "snapshot-save" (since 6.0)
#
# @snapshot-delete: snapshot delete job type, see "snapshot-delete" (since 6.0)
#
# Since: 1.7 # Since: 1.7
## ##
{ 'enum': 'JobType', { 'enum': 'JobType',
'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend'] } 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend',
'snapshot-load', 'snapshot-save', 'snapshot-delete'] }
## ##
# @JobStatus: # @JobStatus:

View File

@ -78,7 +78,7 @@
# Since: 1.2 # Since: 1.2
## ##
{ 'struct': 'XBZRLECacheStats', { 'struct': 'XBZRLECacheStats',
'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int', 'data': {'cache-size': 'size', 'bytes': 'int', 'pages': 'int',
'cache-miss': 'int', 'cache-miss-rate': 'number', 'cache-miss': 'int', 'cache-miss-rate': 'number',
'encoding-rate': 'number', 'overflow': 'int' } } 'encoding-rate': 'number', 'overflow': 'int' } }
@ -224,6 +224,10 @@
# only returned if VFIO device is present, migration is supported by all # only returned if VFIO device is present, migration is supported by all
# VFIO devices and status is 'active' or 'completed' (since 5.2) # VFIO devices and status is 'active' or 'completed' (since 5.2)
# #
# @blocked: True if outgoing migration is blocked (since 6.0)
#
# @blocked-reasons: A list of reasons an outgoing migration is blocked (since 6.0)
#
# Since: 0.14 # Since: 0.14
## ##
{ 'struct': 'MigrationInfo', { 'struct': 'MigrationInfo',
@ -237,6 +241,8 @@
'*setup-time': 'int', '*setup-time': 'int',
'*cpu-throttle-percentage': 'int', '*cpu-throttle-percentage': 'int',
'*error-desc': 'str', '*error-desc': 'str',
'blocked': 'bool',
'*blocked-reasons': ['str'],
'*postcopy-blocktime' : 'uint32', '*postcopy-blocktime' : 'uint32',
'*postcopy-vcpu-blocktime': ['uint32'], '*postcopy-vcpu-blocktime': ['uint32'],
'*compression': 'CompressionStats', '*compression': 'CompressionStats',
@ -442,6 +448,11 @@
# @validate-uuid: Send the UUID of the source to allow the destination # @validate-uuid: Send the UUID of the source to allow the destination
# to ensure it is the same. (since 4.2) # to ensure it is the same. (since 4.2)
# #
# @background-snapshot: If enabled, the migration stream will be a snapshot
# of the VM exactly at the point when the migration
# procedure starts. The VM RAM is saved while the VM is running.
# (since 6.0)
#
# Since: 1.2 # Since: 1.2
## ##
{ 'enum': 'MigrationCapability', { 'enum': 'MigrationCapability',
@ -449,7 +460,7 @@
'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
'block', 'return-path', 'pause-before-switchover', 'multifd', 'block', 'return-path', 'pause-before-switchover', 'multifd',
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
'x-ignore-shared', 'validate-uuid' ] } 'x-ignore-shared', 'validate-uuid', 'background-snapshot'] }
## ##
# @MigrationCapabilityStatus: # @MigrationCapabilityStatus:
@ -885,28 +896,28 @@
'*announce-max': 'size', '*announce-max': 'size',
'*announce-rounds': 'size', '*announce-rounds': 'size',
'*announce-step': 'size', '*announce-step': 'size',
'*compress-level': 'int', '*compress-level': 'uint8',
'*compress-threads': 'int', '*compress-threads': 'uint8',
'*compress-wait-thread': 'bool', '*compress-wait-thread': 'bool',
'*decompress-threads': 'int', '*decompress-threads': 'uint8',
'*throttle-trigger-threshold': 'int', '*throttle-trigger-threshold': 'uint8',
'*cpu-throttle-initial': 'int', '*cpu-throttle-initial': 'uint8',
'*cpu-throttle-increment': 'int', '*cpu-throttle-increment': 'uint8',
'*cpu-throttle-tailslow': 'bool', '*cpu-throttle-tailslow': 'bool',
'*tls-creds': 'StrOrNull', '*tls-creds': 'StrOrNull',
'*tls-hostname': 'StrOrNull', '*tls-hostname': 'StrOrNull',
'*tls-authz': 'StrOrNull', '*tls-authz': 'StrOrNull',
'*max-bandwidth': 'int', '*max-bandwidth': 'size',
'*downtime-limit': 'int', '*downtime-limit': 'uint64',
'*x-checkpoint-delay': 'int', '*x-checkpoint-delay': 'uint32',
'*block-incremental': 'bool', '*block-incremental': 'bool',
'*multifd-channels': 'int', '*multifd-channels': 'uint8',
'*xbzrle-cache-size': 'size', '*xbzrle-cache-size': 'size',
'*max-postcopy-bandwidth': 'size', '*max-postcopy-bandwidth': 'size',
'*max-cpu-throttle': 'int', '*max-cpu-throttle': 'uint8',
'*multifd-compression': 'MultiFDCompression', '*multifd-compression': 'MultiFDCompression',
'*multifd-zlib-level': 'int', '*multifd-zlib-level': 'uint8',
'*multifd-zstd-level': 'int', '*multifd-zstd-level': 'uint8',
'*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } }
## ##
@ -1093,7 +1104,7 @@
'*max-bandwidth': 'size', '*max-bandwidth': 'size',
'*downtime-limit': 'uint64', '*downtime-limit': 'uint64',
'*x-checkpoint-delay': 'uint32', '*x-checkpoint-delay': 'uint32',
'*block-incremental': 'bool' , '*block-incremental': 'bool',
'*multifd-channels': 'uint8', '*multifd-channels': 'uint8',
'*xbzrle-cache-size': 'size', '*xbzrle-cache-size': 'size',
'*max-postcopy-bandwidth': 'size', '*max-postcopy-bandwidth': 'size',
@ -1465,7 +1476,7 @@
# <- { "return": 67108864 } # <- { "return": 67108864 }
# #
## ##
{ 'command': 'query-migrate-cache-size', 'returns': 'int', { 'command': 'query-migrate-cache-size', 'returns': 'size',
'features': [ 'deprecated' ] } 'features': [ 'deprecated' ] }
## ##
@ -1843,3 +1854,176 @@
# Since: 5.2 # Since: 5.2
## ##
{ 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' } { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
##
# @snapshot-save:
#
# Save a VM snapshot
#
# @job-id: identifier for the newly created job
# @tag: name of the snapshot to create
# @vmstate: block device node name to save vmstate to
# @devices: list of block device node names to save a snapshot to
#
# Applications should not assume that the snapshot save is complete
# when this command returns. The job commands / events must be used
# to determine completion and to fetch details of any errors that arise.
#
# Note that execution of the guest CPUs may be stopped during the
# time it takes to save the snapshot. A future version of QEMU
# may ensure CPUs are executing continuously.
#
# It is strongly recommended that @devices contain all writable
# block device nodes if a consistent snapshot is required.
#
# If @tag already exists, an error will be reported.
#
# Returns: nothing
#
# Example:
#
# -> { "execute": "snapshot-save",
# "data": {
# "job-id": "snapsave0",
# "tag": "my-snap",
# "vmstate": "disk0",
# "devices": ["disk0", "disk1"]
# }
# }
# <- { "return": { } }
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "created", "id": "snapsave0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "running", "id": "snapsave0"}}
# <- {"event": "STOP"}
# <- {"event": "RESUME"}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "waiting", "id": "snapsave0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "pending", "id": "snapsave0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "concluded", "id": "snapsave0"}}
# -> {"execute": "query-jobs"}
# <- {"return": [{"current-progress": 1,
# "status": "concluded",
# "total-progress": 1,
# "type": "snapshot-save",
# "id": "snapsave0"}]}
#
# Since: 6.0
##
{ 'command': 'snapshot-save',
'data': { 'job-id': 'str',
'tag': 'str',
'vmstate': 'str',
'devices': ['str'] } }
##
# @snapshot-load:
#
# Load a VM snapshot
#
# @job-id: identifier for the newly created job
# @tag: name of the snapshot to load.
# @vmstate: block device node name to load vmstate from
# @devices: list of block device node names to load a snapshot from
#
# Applications should not assume that the snapshot load is complete
# when this command returns. The job commands / events must be used
# to determine completion and to fetch details of any errors that arise.
#
# Note that execution of the guest CPUs will be stopped during the
# time it takes to load the snapshot.
#
# It is strongly recommended that @devices contain all writable
# block device nodes that can have changed since the original
# @snapshot-save command execution.
#
# Returns: nothing
#
# Example:
#
# -> { "execute": "snapshot-load",
# "data": {
# "job-id": "snapload0",
# "tag": "my-snap",
# "vmstate": "disk0",
# "devices": ["disk0", "disk1"]
# }
# }
# <- { "return": { } }
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "created", "id": "snapload0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "running", "id": "snapload0"}}
# <- {"event": "STOP"}
# <- {"event": "RESUME"}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "waiting", "id": "snapload0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "pending", "id": "snapload0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "concluded", "id": "snapload0"}}
# -> {"execute": "query-jobs"}
# <- {"return": [{"current-progress": 1,
# "status": "concluded",
# "total-progress": 1,
# "type": "snapshot-load",
# "id": "snapload0"}]}
#
# Since: 6.0
##
{ 'command': 'snapshot-load',
'data': { 'job-id': 'str',
'tag': 'str',
'vmstate': 'str',
'devices': ['str'] } }
##
# @snapshot-delete:
#
# Delete a VM snapshot
#
# @job-id: identifier for the newly created job
# @tag: name of the snapshot to delete.
# @devices: list of block device node names to delete a snapshot from
#
# Applications should not assume that the snapshot delete is complete
# when this command returns. The job commands / events must be used
# to determine completion and to fetch details of any errors that arise.
#
# Returns: nothing
#
# Example:
#
# -> { "execute": "snapshot-delete",
# "data": {
# "job-id": "snapdelete0",
# "tag": "my-snap",
# "devices": ["disk0", "disk1"]
# }
# }
# <- { "return": { } }
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "created", "id": "snapdelete0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "running", "id": "snapdelete0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "waiting", "id": "snapdelete0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "pending", "id": "snapdelete0"}}
# <- {"event": "JOB_STATUS_CHANGE",
# "data": {"status": "concluded", "id": "snapdelete0"}}
# -> {"execute": "query-jobs"}
# <- {"return": [{"current-progress": 1,
# "status": "concluded",
# "total-progress": 1,
# "type": "snapshot-delete",
# "id": "snapdelete0"}]}
#
# Since: 6.0
##
{ 'command': 'snapshot-delete',
'data': { 'job-id': 'str',
'tag': 'str',
'devices': ['str'] } }

View File

@ -143,12 +143,13 @@ static char *replay_find_nearest_snapshot(int64_t icount,
QEMUSnapshotInfo *sn_tab; QEMUSnapshotInfo *sn_tab;
QEMUSnapshotInfo *nearest = NULL; QEMUSnapshotInfo *nearest = NULL;
char *ret = NULL; char *ret = NULL;
int rv;
int nb_sns, i; int nb_sns, i;
AioContext *aio_context; AioContext *aio_context;
*snapshot_icount = -1; *snapshot_icount = -1;
bs = bdrv_all_find_vmstate_bs(); bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, NULL);
if (!bs) { if (!bs) {
goto fail; goto fail;
} }
@ -159,7 +160,10 @@ static char *replay_find_nearest_snapshot(int64_t icount,
aio_context_release(aio_context); aio_context_release(aio_context);
for (i = 0; i < nb_sns; i++) { for (i = 0; i < nb_sns; i++) {
if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) { rv = bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL);
if (rv < 0)
goto fail;
if (rv == 1) {
if (sn_tab[i].icount != -1ULL if (sn_tab[i].icount != -1ULL
&& sn_tab[i].icount <= icount && sn_tab[i].icount <= icount
&& (!nearest || nearest->icount < sn_tab[i].icount)) { && (!nearest || nearest->icount < sn_tab[i].icount)) {
@ -192,7 +196,7 @@ static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp)
if (icount < replay_get_current_icount() if (icount < replay_get_current_icount()
|| replay_get_current_icount() < snapshot_icount) { || replay_get_current_icount() < snapshot_icount) {
vm_stop(RUN_STATE_RESTORE_VM); vm_stop(RUN_STATE_RESTORE_VM);
load_snapshot(snapshot, errp); load_snapshot(snapshot, NULL, false, NULL, errp);
} }
g_free(snapshot); g_free(snapshot);
} }
@ -323,7 +327,7 @@ void replay_gdb_attached(void)
*/ */
if (replay_mode == REPLAY_MODE_PLAY if (replay_mode == REPLAY_MODE_PLAY
&& !replay_snapshot) { && !replay_snapshot) {
if (save_snapshot("start_debugging", NULL) != 0) { if (!save_snapshot("start_debugging", true, NULL, false, NULL, NULL)) {
/* Can't create the snapshot. Continue conventional debugging. */ /* Can't create the snapshot. Continue conventional debugging. */
} }
} }

View File

@ -77,13 +77,14 @@ void replay_vmstate_init(void)
if (replay_snapshot) { if (replay_snapshot) {
if (replay_mode == REPLAY_MODE_RECORD) { if (replay_mode == REPLAY_MODE_RECORD) {
if (save_snapshot(replay_snapshot, &err) != 0) { if (!save_snapshot(replay_snapshot,
true, NULL, false, NULL, &err)) {
error_report_err(err); error_report_err(err);
error_report("Could not create snapshot for icount record"); error_report("Could not create snapshot for icount record");
exit(1); exit(1);
} }
} else if (replay_mode == REPLAY_MODE_PLAY) { } else if (replay_mode == REPLAY_MODE_PLAY) {
if (load_snapshot(replay_snapshot, &err) != 0) { if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) {
error_report_err(err); error_report_err(err);
error_report("Could not load snapshot for icount replay"); error_report("Could not load snapshot for icount replay");
exit(1); exit(1);

122
scripts/userfaultfd-wrlat.py Executable file
View File

@ -0,0 +1,122 @@
#!/usr/bin/python3
#
# userfaultfd-wrlat Summarize userfaultfd write fault latencies.
# Events are continuously accumulated for the
# run, while latency distribution histogram is
# dumped each 'interval' seconds.
#
# For Linux, uses BCC, eBPF.
#
# USAGE: userfaultfd-wrlat [interval [count]]
#
# Copyright Virtuozzo GmbH, 2020
#
# Authors:
# Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
#
# This work is licensed under the terms of the GNU GPL, version 2 or
# later. See the COPYING file in the top-level directory.
from __future__ import print_function
from bcc import BPF
from ctypes import c_ushort, c_int, c_ulonglong
from time import sleep
from sys import argv
def usage():
    """Print the command-line synopsis and terminate the script.

    Uses sys.exit() rather than the exit() helper, which is injected by
    the site module and is not guaranteed to exist (e.g. under python -S).
    The exit status is unchanged (no explicit code is passed).
    """
    import sys  # local import: the file only imports `argv` from sys

    print("USAGE: %s [interval [count]]" % argv[0])
    sys.exit()
# Define the BPF program (C source handed to BCC as text).
#
# ev_start maps a thread id to the timestamp taken when
# probe_handle_userfault() sees a fault whose reason has VM_UFFD_WP set.
# retprobe_handle_userfault() looks that timestamp up, converts the
# elapsed nanoseconds to milliseconds and adds the log2 of the result to
# the ev_delta_hist histogram, then removes the entry.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/mm.h>
BPF_HASH(ev_start, u32, u64);
BPF_HISTOGRAM(ev_delta_hist, u64);
/* Trace UFFD page fault start event. */
static void do_event_start()
{
/* Using "(u32)" to drop group ID which is upper 32 bits */
u32 tid = (u32) bpf_get_current_pid_tgid();
u64 ts = bpf_ktime_get_ns();
ev_start.update(&tid, &ts);
}
/* Trace UFFD page fault end event. */
static void do_event_end()
{
/* Using "(u32)" to drop group ID which is upper 32 bits */
u32 tid = (u32) bpf_get_current_pid_tgid();
u64 ts = bpf_ktime_get_ns();
u64 *tsp;
tsp = ev_start.lookup(&tid);
if (tsp) {
u64 delta = ts - (*tsp);
/* Transform time delta to milliseconds */
ev_delta_hist.increment(bpf_log2l(delta / 1000000));
ev_start.delete(&tid);
}
}
/* KPROBE for handle_userfault(). */
int probe_handle_userfault(struct pt_regs *ctx, struct vm_fault *vmf,
unsigned long reason)
{
/* Trace only UFFD write faults. */
if (reason & VM_UFFD_WP) {
do_event_start();
}
return 0;
}
/* KRETPROBE for handle_userfault(). */
int retprobe_handle_userfault(struct pt_regs *ctx)
{
do_event_end();
return 0;
}
"""
# Command-line arguments: [interval [count]]
#   interval - seconds between histogram dumps (default 10)
#   count    - number of dumps before exiting (default -1: run until Ctrl-C)
interval = 10
count = -1
if len(argv) > 1:
    try:
        interval = int(argv[1])
        # A zero or negative interval is meaningless (sleep() rejects
        # negative values), so treat it like any other bad argument.
        if interval <= 0:
            raise ValueError("interval must be a positive integer")
        if len(argv) > 2:
            count = int(argv[2])
    except ValueError:  # also raised by int() for -h, --help
        usage()
# Compile and load the BPF program.
b = BPF(text=bpf_text)

# Attach the kprobe/kretprobe pair to handle_userfault().
b.attach_kprobe(event="handle_userfault",
                fn_name="probe_handle_userfault")
b.attach_kretprobe(event="handle_userfault",
                   fn_name="retprobe_handle_userfault")

# Header
print("Tracing UFFD-WP write fault latency... Hit Ctrl-C to end.")

# Dump the histogram after every interval; stop once 'count' dumps have
# been printed (when a positive count was given) or after Ctrl-C.
loop = 0
do_exit = 0
while True:
    if count > 0:
        loop += 1
        if loop > count:
            exit()

    try:
        sleep(interval)
    except KeyboardInterrupt:
        # Print one last histogram before leaving.
        do_exit = 1

    print()
    b["ev_delta_hist"].print_log2_hist("msecs")

    if do_exit:
        exit()

View File

@ -2545,7 +2545,7 @@ void qmp_x_exit_preconfig(Error **errp)
if (loadvm) { if (loadvm) {
Error *local_err = NULL; Error *local_err = NULL;
if (load_snapshot(loadvm, &local_err) < 0) { if (!load_snapshot(loadvm, NULL, false, NULL, &local_err)) {
error_report_err(local_err); error_report_err(local_err);
autostart = 0; autostart = 0;
exit(1); exit(1);

View File

@ -6,11 +6,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
Testing: Testing:
QEMU X.Y.Z monitor - type 'help' for more information QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0 (qemu) savevm snap0
Error: No block device can accept snapshots Error: no block device can store vmstate for snapshot
(qemu) info snapshots (qemu) info snapshots
No available block device supports snapshots no block device can store vmstate for snapshot
(qemu) loadvm snap0 (qemu) loadvm snap0
Error: No block device supports snapshots Error: no block device can store vmstate for snapshot
(qemu) quit (qemu) quit
@ -22,7 +22,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0 (qemu) savevm snap0
Error: Device 'none0' is writable but does not support snapshots Error: Device 'none0' is writable but does not support snapshots
(qemu) info snapshots (qemu) info snapshots
No available block device supports snapshots no block device can store vmstate for snapshot
(qemu) loadvm snap0 (qemu) loadvm snap0
Error: Device 'none0' is writable but does not support snapshots Error: Device 'none0' is writable but does not support snapshots
(qemu) quit (qemu) quit
@ -58,7 +58,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0 (qemu) savevm snap0
Error: Device 'virtio0' is writable but does not support snapshots Error: Device 'virtio0' is writable but does not support snapshots
(qemu) info snapshots (qemu) info snapshots
No available block device supports snapshots no block device can store vmstate for snapshot
(qemu) loadvm snap0 (qemu) loadvm snap0
Error: Device 'virtio0' is writable but does not support snapshots Error: Device 'virtio0' is writable but does not support snapshots
(qemu) quit (qemu) quit
@ -83,7 +83,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0 (qemu) savevm snap0
Error: Device 'file' is writable but does not support snapshots Error: Device 'file' is writable but does not support snapshots
(qemu) info snapshots (qemu) info snapshots
No available block device supports snapshots no block device can store vmstate for snapshot
(qemu) loadvm snap0 (qemu) loadvm snap0
Error: Device 'file' is writable but does not support snapshots Error: Device 'file' is writable but does not support snapshots
(qemu) quit (qemu) quit

View File

@ -53,6 +53,15 @@ _in_fd=4
# If $mismatch_only is set, only non-matching responses will # If $mismatch_only is set, only non-matching responses will
# be echoed. # be echoed.
# #
# If $capture_events is non-empty, then any QMP event names it lists
# will not be echoed out, but instead collected in the $QEMU_EVENTS
# variable. The _wait_event function can later be used to receive
# the cached events.
#
# If $only_capture_events is set to anything but an empty string,
# then an error will be raised if a QMP message is seen which is
# not an event listed in $capture_events.
#
# If $success_or_failure is set, the meaning of the arguments is # If $success_or_failure is set, the meaning of the arguments is
# changed as follows: # changed as follows:
# $2: A string to search for in the response; if found, this indicates # $2: A string to search for in the response; if found, this indicates
@ -78,6 +87,31 @@ _timed_wait_for()
QEMU_STATUS[$h]=0 QEMU_STATUS[$h]=0
while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]}
do do
if [ -n "$capture_events" ]; then
capture=0
local evname
for evname in $capture_events
do
case ${resp} in
*\"event\":\ \"${evname}\"* ) capture=1 ;;
esac
done
if [ $capture = 1 ];
then
ev=$(echo "${resp}" | tr -d '\r' | tr % .)
QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}"
if [ -n "$only_capture_events" ]; then
return
else
continue
fi
fi
fi
if [ -n "$only_capture_events" ]; then
echo "Only expected $capture_events but got ${resp}"
exit 1
fi
if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then
echo "${resp}" | _filter_testdir | _filter_qemu \ echo "${resp}" | _filter_testdir | _filter_qemu \
| _filter_qemu_io | _filter_qmp | _filter_hmp | _filter_qemu_io | _filter_qmp | _filter_hmp
@ -172,12 +206,82 @@ _send_qemu_cmd()
let count--; let count--;
done done
if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then
echo "Timeout waiting for ${1} on handle ${h}" echo "Timeout waiting for command ${1} response on handle ${h}"
exit 1 #Timeout means the test failed exit 1 #Timeout means the test failed
fi fi
} }
# Check event cache for a named QMP event
#
# Input parameters:
# $1: Name of the QMP event to check for
#
# Checks whether the named QMP event was previously captured
# into $QEMU_EVENTS. When matched, the QMP event is echoed through
# the output filters, dropped from $QEMU_EVENTS, and the $matched
# variable is set to 1. Otherwise $matched is left at 0.
#
# Only the first matching cached event is consumed; every other
# cached event (including later matches) is put back into
# $QEMU_EVENTS in its original order.
#
# _wait_event is more suitable for test usage in most cases
_check_cached_events()
{
local evname=${1}
local match="\"event\": \"$evname\""
matched=0
if [ -n "$QEMU_EVENTS" ]; then
# Take a copy of the cache and rebuild it while iterating, so the
# consumed event can simply be left out of the new value.
CURRENT_QEMU_EVENTS=$QEMU_EVENTS
QEMU_EVENTS=
# Cached events are joined with '%', so split on that character
# and restore the caller's IFS once the loop is done.
old_IFS=$IFS
IFS="%"
for ev in $CURRENT_QEMU_EVENTS
do
grep -q "$match" < <(echo "${ev}")
if [ $? -eq 0 ] && [ $matched = 0 ]; then
echo "${ev}" | _filter_testdir | _filter_qemu \
| _filter_qemu_io | _filter_qmp | _filter_hmp
matched=1
else
# Not the event being consumed: append it back to the cache
QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}"
fi
done
IFS=$old_IFS
fi
}
# Wait for a named QMP event
#
# Input parameters:
# $1: QEMU handle to use
# $2: Name of the QMP event to wait for
#
# Checks whether the named QMP event was previously captured
# into $QEMU_EVENTS. If it is not present, waits for further
# output to arrive on the QMP channel and checks again. When
# matched, the QMP event will be echoed (by _check_cached_events).
#
# If ${QEMU_STATUS[$h]} is non-zero after waiting, a timeout
# message is printed and the test exits with status 1.
_wait_event()
{
local h=${1}
local evname=${2}
while true
do
# Consume the event from the cache if it has already been seen
_check_cached_events $evname
if [ $matched = 1 ];
then
return
fi
# Nothing cached yet: read more QMP output in capture-only mode.
# NOTE(review): qemu_error_no_exit=1 presumably keeps
# _timed_wait_for from exiting on its own so that the timeout is
# reported by the check below -- confirm against _timed_wait_for.
only_capture_events=1 qemu_error_no_exit=1 _timed_wait_for ${h}
if [ ${QEMU_STATUS[$h]} -ne 0 ] ; then
echo "Timeout waiting for event ${evname} on handle ${h}"
exit 1 #Timeout means the test failed
fi
done
}
# Launch a QEMU process. # Launch a QEMU process.
# #
# Input parameters: # Input parameters:

View File

@ -109,8 +109,14 @@ peek_file_raw()
dd if="$1" bs=1 skip="$2" count="$3" status=none dd if="$1" bs=1 skip="$2" count="$3" status=none
} }
config=common.config
if ! . ./common.config test -f $config || config=../common.config
if ! test -f $config
then
echo "$0: failed to find common.config"
exit 1
fi
if ! . $config
then then
echo "$0: failed to source common.config" echo "$0: failed to source common.config"
exit 1 exit 1

View File

@ -52,6 +52,7 @@ if have_system
util_ss.add(files('crc-ccitt.c')) util_ss.add(files('crc-ccitt.c'))
util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio]) util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
util_ss.add(files('yank.c')) util_ss.add(files('yank.c'))
util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
endif endif
if have_block if have_block

View File

@ -91,3 +91,12 @@ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uin
qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
#userfaultfd.c
uffd_query_features_nosys(int err) "errno: %i"
uffd_query_features_api_failed(int err) "errno: %i"
uffd_create_fd_nosys(int err) "errno: %i"
uffd_create_fd_api_failed(int err) "errno: %i"
uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 " ioctl_supp: 0x%" PRIx64
uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i"
uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i"

345
util/userfaultfd.c Normal file
View File

@ -0,0 +1,345 @@
/*
* Linux UFFD-WP support
*
* Copyright Virtuozzo GmbH, 2020
*
* Authors:
* Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or
* later. See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu/bitops.h"
#include "qemu/error-report.h"
#include "qemu/userfaultfd.h"
#include "trace.h"
#include <poll.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
/**
 * uffd_query_features: report the UFFD feature mask
 *
 * Opens a short-lived userfaultfd descriptor, issues UFFDIO_API on it
 * with no features requested, and stores the 'features' field found in
 * the API structure after the call. The descriptor is closed again
 * before returning.
 *
 * Returns: 0 on success, -1 if the descriptor could not be created or
 * the UFFDIO_API ioctl failed (@features is not written in that case)
 *
 * @features: parameter to receive 'uffdio_api.features'
 */
int uffd_query_features(uint64_t *features)
{
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    int status = -1;
    int fd;

    fd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (fd < 0) {
        trace_uffd_query_features_nosys(errno);
        return -1;
    }

    if (ioctl(fd, UFFDIO_API, &api) == 0) {
        *features = api.features;
        status = 0;
    } else {
        trace_uffd_query_features_api_failed(errno);
    }

    /* The descriptor was only needed for the query */
    close(fd);
    return status;
}
/**
 * uffd_create_fd: open a UFFD file descriptor
 *
 * Creates the descriptor with O_CLOEXEC (plus O_NONBLOCK when requested),
 * performs the UFFDIO_API handshake with the requested features and
 * verifies that both the REGISTER and UNREGISTER ioctls are reported
 * in the returned ioctl mask.
 *
 * Returns non-negative file descriptor, or -1 in case of an error
 * (the descriptor is closed if it had already been created)
 *
 * @features: UFFD features to request
 * @non_blocking: create UFFD file descriptor for non-blocking operation
 */
int uffd_create_fd(uint64_t features, bool non_blocking)
{
    const uint64_t required_ioctls =
        BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
    struct uffdio_api api = { .api = UFFD_API, .features = features };
    int open_flags = O_CLOEXEC;
    int fd;

    if (non_blocking) {
        open_flags |= O_NONBLOCK;
    }

    fd = syscall(__NR_userfaultfd, open_flags);
    if (fd < 0) {
        trace_uffd_create_fd_nosys(errno);
        return -1;
    }

    if (ioctl(fd, UFFDIO_API, &api) != 0) {
        trace_uffd_create_fd_api_failed(errno);
    } else if ((api.ioctls & required_ioctls) != required_ioctls) {
        trace_uffd_create_fd_api_noioctl(required_ioctls, api.ioctls);
    } else {
        return fd;
    }

    /* Either failure above leaves us with a descriptor we cannot use */
    close(fd);
    return -1;
}
/**
 * uffd_close_fd: release a UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative and then closes it.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for [@addr, @addr + @length) with the given mode.
 *
 * Returns 0 in case of success, -1 if the ioctl failed
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          and is only written on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for [@addr, @addr + @length).
 *
 * Returns 0 in case of success, -1 if the ioctl failed
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) == 0) {
        return 0;
    }

    trace_uffd_unregister_memory_failed(addr, length, errno);
    return -1;
}
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for [@addr, @addr + @length). The mode is
 * UFFDIO_WRITEPROTECT_MODE_WP when protecting, MODE_DONTWAKE when
 * un-protecting with @dont_wake set, and 0 otherwise.
 *
 * Returns 0 on success, -1 if the ioctl failed (an error is reported)
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) == 0) {
        return 0;
    }

    error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr,
 * resolving a missing page fault somewhere in the destination range.
 *
 * Returns 0 on success, -1 if the ioctl failed (an error is reported)
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) == 0) {
        return 0;
    }

    error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                 length, (uint64_t) req.mode, errno);
    return -1;
}
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for [@addr, @addr + @length) to resolve a
 * missing page fault within the range.
 *
 * Returns 0 on success, -1 if the ioctl failed (an error is reported)
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) == 0) {
        return 0;
    }

    error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE for [@addr, @addr + @length), waking threads waiting
 * on any page in that range. The main use case is a period during which
 * page faults are resolved via UFFD-IO IOCTLs with MODE_DONTWAKE set;
 * afterwards all waits for the whole memory range can be satisfied with
 * a single call to uffd_wakeup().
 *
 * Returns 0 on success, -1 if the ioctl failed (an error is reported)
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) == 0) {
        return 0;
    }

    error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                 addr, length, errno);
    return -1;
}
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor, retrying the read
 * when it is interrupted (EINTR).
 *
 * Returns number of fetched messages, 0 if none is available (the read
 * failed with EAGAIN), or -1 on any other read error (an error is reported)
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    const size_t msg_size = sizeof(struct uffd_msg);

    for (;;) {
        ssize_t nread = read(uffd_fd, msgs, count * msg_size);

        if (nread >= 0) {
            /* Convert the byte count into a number of whole messages */
            return (int) (nread / msg_size);
        }
        if (errno == EINTR) {
            continue;
        }
        if (errno == EAGAIN) {
            return 0;
        }

        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }
}
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Calls poll() on the descriptor for POLLIN, retrying when the call is
 * interrupted (EINTR).
 *
 * Returns true if events are available for read; false on timeout, on a
 * poll() error (which is reported), or when POLLIN is not set in the
 * returned events
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged as poll()'s timeout argument
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };

    for (;;) {
        int rc = poll(&pfd, 1, tmo);

        if (rc > 0) {
            return (pfd.revents & POLLIN) != 0;
        }
        if (rc == 0) {
            /* Timed out with nothing to report */
            return false;
        }
        if (errno == EINTR) {
            continue;
        }

        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }
}