From 741d4086c856320807a2575389d7c0505578270b Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Fri, 1 Dec 2017 13:08:38 +0100 Subject: [PATCH 01/27] migration: Use proper types in json We use int for everything (int64_t), and then we check that value is between 0 and 255. Change it to the valid types. This change only happens for HMP. QMP always use bytes and similar. Signed-off-by: Juan Quintela Reviewed-by: Eric Blake --- hmp.c | 22 +++++++++---------- migration/migration.c | 49 ++++++++++++++++++------------------------- qapi/migration.json | 20 +++++++++--------- 3 files changed, 41 insertions(+), 50 deletions(-) diff --git a/hmp.c b/hmp.c index 2d72f94193..056bf70cf1 100644 --- a/hmp.c +++ b/hmp.c @@ -293,23 +293,23 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) if (params) { assert(params->has_compress_level); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_LEVEL), params->compress_level); assert(params->has_compress_threads); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_THREADS), params->compress_threads); assert(params->has_decompress_threads); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_DECOMPRESS_THREADS), params->decompress_threads); assert(params->has_cpu_throttle_initial); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL), params->cpu_throttle_initial); assert(params->has_cpu_throttle_increment); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT), params->cpu_throttle_increment); assert(params->has_tls_creds); @@ -321,28 +321,28 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MigrationParameter_str(MIGRATION_PARAMETER_TLS_HOSTNAME), params->tls_hostname); assert(params->has_max_bandwidth); - monitor_printf(mon, "%s: %" PRId64 " bytes/second\n", + monitor_printf(mon, "%s: %" PRIu64 " bytes/second\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_BANDWIDTH), params->max_bandwidth); assert(params->has_downtime_limit); - monitor_printf(mon, "%s: %" PRId64 " milliseconds\n", + monitor_printf(mon, "%s: %" PRIu64 " milliseconds\n", MigrationParameter_str(MIGRATION_PARAMETER_DOWNTIME_LIMIT), params->downtime_limit); assert(params->has_x_checkpoint_delay); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_X_CHECKPOINT_DELAY), params->x_checkpoint_delay); assert(params->has_block_incremental); monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_BLOCK_INCREMENTAL), params->block_incremental ? 
"on" : "off"); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_X_MULTIFD_CHANNELS), params->x_multifd_channels); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_X_MULTIFD_PAGE_COUNT), params->x_multifd_page_count); - monitor_printf(mon, "%s: %" PRId64 "\n", + monitor_printf(mon, "%s: %" PRIu64 "\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); } diff --git a/migration/migration.c b/migration/migration.c index 4de3b551fe..8d2372394c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -741,22 +741,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, static bool migrate_params_check(MigrationParameters *params, Error **errp) { if (params->has_compress_level && - (params->compress_level < 0 || params->compress_level > 9)) { + (params->compress_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", "is invalid, it should be in the range of 0 to 9"); return false; } - if (params->has_compress_threads && - (params->compress_threads < 1 || params->compress_threads > 255)) { + if (params->has_compress_threads && (params->compress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_threads", "is invalid, it should be in the range of 1 to 255"); return false; } - if (params->has_decompress_threads && - (params->decompress_threads < 1 || params->decompress_threads > 255)) { + if (params->has_decompress_threads && (params->decompress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "decompress_threads", "is invalid, it should be in the range of 1 to 255"); @@ -781,38 +779,31 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } - if (params->has_max_bandwidth && - (params->max_bandwidth < 0 || params->max_bandwidth > SIZE_MAX)) { + if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) { error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the" " range of 0 to %zu bytes/second", SIZE_MAX); return false; } if (params->has_downtime_limit && - (params->downtime_limit < 0 || - params->downtime_limit > MAX_MIGRATE_DOWNTIME)) { + (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) { error_setg(errp, "Parameter 'downtime_limit' expects an integer in " "the range of 0 to %d milliseconds", MAX_MIGRATE_DOWNTIME); return false; } - if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "x_checkpoint_delay", - "is invalid, it should be positive"); - return false; - } - if (params->has_x_multifd_channels && - (params->x_multifd_channels < 1 || params->x_multifd_channels > 255)) { + /* x_checkpoint_delay is now always positive */ + + if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_channels", "is invalid, it should be in the range of 1 to 255"); return false; } if (params->has_x_multifd_page_count && - (params->x_multifd_page_count < 1 || - params->x_multifd_page_count > 10000)) { + (params->x_multifd_page_count < 1 || + params->x_multifd_page_count > 10000)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_page_count", "is invalid, it should be in the range of 1 to 10000"); @@ -2394,33 +2385,33 @@ static Property migration_properties[] = { send_section_footer, true), /* Migration parameters */ - 
DEFINE_PROP_INT64("x-compress-level", MigrationState, + DEFINE_PROP_UINT8("x-compress-level", MigrationState, parameters.compress_level, DEFAULT_MIGRATE_COMPRESS_LEVEL), - DEFINE_PROP_INT64("x-compress-threads", MigrationState, + DEFINE_PROP_UINT8("x-compress-threads", MigrationState, parameters.compress_threads, DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT), - DEFINE_PROP_INT64("x-decompress-threads", MigrationState, + DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, parameters.decompress_threads, DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), - DEFINE_PROP_INT64("x-cpu-throttle-initial", MigrationState, + DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState, parameters.cpu_throttle_initial, DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL), - DEFINE_PROP_INT64("x-cpu-throttle-increment", MigrationState, + DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState, parameters.cpu_throttle_increment, DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), - DEFINE_PROP_INT64("x-max-bandwidth", MigrationState, + DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), - DEFINE_PROP_INT64("x-downtime-limit", MigrationState, + DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, parameters.downtime_limit, DEFAULT_MIGRATE_SET_DOWNTIME), - DEFINE_PROP_INT64("x-checkpoint-delay", MigrationState, + DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState, parameters.x_checkpoint_delay, DEFAULT_MIGRATE_X_CHECKPOINT_DELAY), - DEFINE_PROP_INT64("x-multifd-channels", MigrationState, + DEFINE_PROP_UINT8("x-multifd-channels", MigrationState, parameters.x_multifd_channels, DEFAULT_MIGRATE_MULTIFD_CHANNELS), - DEFINE_PROP_INT64("x-multifd-page-count", MigrationState, + DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState, parameters.x_multifd_page_count, DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT), DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, diff --git a/qapi/migration.json b/qapi/migration.json index 03f57c9616..4cd3d13158 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -668,19 +668,19 @@ # Since: 2.4 ## { 'struct': 'MigrationParameters', - 'data': { '*compress-level': 'int', - '*compress-threads': 'int', - '*decompress-threads': 'int', - '*cpu-throttle-initial': 'int', - '*cpu-throttle-increment': 'int', + 'data': { '*compress-level': 'uint8', + '*compress-threads': 'uint8', + '*decompress-threads': 'uint8', + '*cpu-throttle-initial': 'uint8', + '*cpu-throttle-increment': 'uint8', '*tls-creds': 'str', '*tls-hostname': 'str', - '*max-bandwidth': 'int', - '*downtime-limit': 'int', - '*x-checkpoint-delay': 'int', + '*max-bandwidth': 'size', + '*downtime-limit': 'uint64', + '*x-checkpoint-delay': 'uint32', '*block-incremental': 'bool' , - '*x-multifd-channels': 'int', - '*x-multifd-page-count': 'int', + '*x-multifd-channels': 'uint8', + '*x-multifd-page-count': 'uint32', '*xbzrle-cache-size': 'size' } } ## From 6f0f6428351d09dd904bf81b137bdd4fd25ba029 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Thu, 26 Oct 2017 11:49:57 +0200 Subject: [PATCH 02/27] migration: print features as on off Once there, do one thing for line Signed-off-by: Juan Quintela Reviewed-by: Dr. 
David Alan Gilbert Reviewed-by: Peter Xu --- migration/migration.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 8d2372394c..19917a4b5b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2366,10 +2366,15 @@ void migration_global_dump(Monitor *mon) { MigrationState *ms = migrate_get_current(); - monitor_printf(mon, "globals: store-global-state=%d, only_migratable=%d, " - "send-configuration=%d, send-section-footer=%d\n", - ms->store_global_state, ms->only_migratable, - ms->send_configuration, ms->send_section_footer); + monitor_printf(mon, "globals:\n"); + monitor_printf(mon, "store-global-state: %s\n", + ms->store_global_state ? "on" : "off"); + monitor_printf(mon, "only-migratable: %s\n", + ms->only_migratable ? "on" : "off"); + monitor_printf(mon, "send-configuration: %s\n", + ms->send_configuration ? "on" : "off"); + monitor_printf(mon, "send-section-footer: %s\n", + ms->send_section_footer ? "on" : "off"); } #define DEFINE_PROP_MIG_CAP(name, x) \ From 9102d27e33b2a5b676ce3eb73b714c1f92062058 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Fri, 27 Oct 2017 12:23:29 +0200 Subject: [PATCH 03/27] migration: free addr in the same function that we created it Otherwise, we can't use it after calling socket_start_incoming_migration Signed-off-by: Juan Quintela Reviewed-by: Peter Xu --- migration/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/socket.c b/migration/socket.c index dee869044a..3a8232dd2d 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -172,7 +172,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr, if (qio_channel_socket_listen_sync(listen_ioc, saddr, errp) < 0) { object_unref(OBJECT(listen_ioc)); - qapi_free_SocketAddress(saddr); return; } @@ -181,7 +180,6 @@ static void socket_start_incoming_migration(SocketAddress *saddr, socket_accept_incoming_migration, listen_ioc, (GDestroyNotify)object_unref); - qapi_free_SocketAddress(saddr); } void tcp_start_incoming_migration(const char *host_port, Error **errp) @@ -191,6 +189,7 @@ void tcp_start_incoming_migration(const char *host_port, Error **errp) if (!err) { socket_start_incoming_migration(saddr, &err); } + qapi_free_SocketAddress(saddr); error_propagate(errp, err); } @@ -198,4 +197,5 @@ void unix_start_incoming_migration(const char *path, Error **errp) { SocketAddress *saddr = unix_build_address(path); socket_start_incoming_migration(saddr, errp); + qapi_free_SocketAddress(saddr); } From 2e3c8f8dbddf2992e29533ded845640d109d77f3 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 15 Dec 2017 15:45:07 +0000 Subject: [PATCH 04/27] docs: Convert migration.txt to rst Mostly just manual conversion with very minor fixes. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Daniel P. Berrange Reviewed-by: Kashyap Chamarthy Reviewed-by: Peter Xu Signed-off-by: Juan Quintela --- docs/devel/{migration.txt => migration.rst} | 450 +++++++++++--------- 1 file changed, 237 insertions(+), 213 deletions(-) rename docs/devel/{migration.txt => migration.rst} (58%) diff --git a/docs/devel/migration.txt b/docs/devel/migration.rst similarity index 58% rename from docs/devel/migration.txt rename to docs/devel/migration.rst index 4030703726..bf97080dac 100644 --- a/docs/devel/migration.txt +++ b/docs/devel/migration.rst @@ -1,4 +1,6 @@ -= Migration = +========= +Migration +========= QEMU has code to load/save the state of the guest that it is running. 
These are two complementary operations. Saving the state just does @@ -26,7 +28,8 @@ the guest to be stopped. Typically the time that the guest is unresponsive during live migration is the low hundred of milliseconds (notice that this depends on a lot of things). -=== Types of migration === +Types of migration +================== Now that we have talked about live migration, there are several ways to do migration: @@ -41,49 +44,21 @@ All these four migration protocols use the same infrastructure to save/restore state devices. This infrastructure is shared with the savevm/loadvm functionality. -=== State Live Migration === +State Live Migration +==================== This is used for RAM and block devices. It is not yet ported to vmstate. -=== What is the common infrastructure === +Common infrastructure +===================== -QEMU uses a QEMUFile abstraction to be able to do migration. Any type -of migration that wants to use QEMU infrastructure has to create a -QEMUFile with: +The files, sockets or fd's that carry the migration stream are abstracted by +the ``QEMUFile`` type (see `migration/qemu-file.h`). In most cases this +is connected to a subtype of ``QIOChannel`` (see `io/`). -QEMUFile *qemu_fopen_ops(void *opaque, - QEMUFilePutBufferFunc *put_buffer, - QEMUFileGetBufferFunc *get_buffer, - QEMUFileCloseFunc *close); - -The functions have the following functionality: - -This function writes a chunk of data to a file at the given position. -The pos argument can be ignored if the file is only used for -streaming. The handler should try to write all of the data it can. - -typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, - int64_t pos, int size); - -Read a chunk of data from a file at the given position. The pos argument -can be ignored if the file is only be used for streaming. The number of -bytes actually read should be returned. - -typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, - int64_t pos, int size); - -Close a file and return an error code. - -typedef int (QEMUFileCloseFunc)(void *opaque); - -You can use any internal state that you need using the opaque void * -pointer that is passed to all functions. - -The important functions for us are put_buffer()/get_buffer() that -allow to write/read a buffer into the QEMUFile. - -=== How to save the state of one device === +Saving the state of one device +============================== The state of a device is saved using intermediate buffers. There are some helper functions to assist this saving. @@ -93,34 +68,38 @@ version. When we migrate a device, we save/load the state as a series of fields. Some times, due to bugs or new functionality, we need to change the state to store more/different information. We use the version to identify each time that we do a change. Each version is -associated with a series of fields saved. The save_state always saves -the state as the newer version. But load_state sometimes is able to +associated with a series of fields saved. The `save_state` always saves +the state as the newer version. But `load_state` sometimes is able to load state from an older version. -=== Legacy way === +Legacy way +---------- This way is going to disappear as soon as all current users are ported to VMSTATE. Each device has to register two functions, one to save the state and another to load the state back. -int register_savevm(DeviceState *dev, - const char *idstr, - int instance_id, - int version_id, - SaveStateHandler *save_state, - LoadStateHandler *load_state, - void *opaque); +.. 
code:: c -typedef void SaveStateHandler(QEMUFile *f, void *opaque); -typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + int register_savevm(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); -The important functions for the device state format are the save_state -and load_state. Notice that load_state receives a version_id -parameter to know what state format is receiving. save_state doesn't + typedef void SaveStateHandler(QEMUFile *f, void *opaque); + typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + +The important functions for the device state format are the `save_state` +and `load_state`. Notice that `load_state` receives a version_id +parameter to know what state format is receiving. `save_state` doesn't have a version_id parameter because it always uses the latest version. -=== VMState === +VMState +------- The legacy way of saving/loading state of the device had the problem that we have to maintain two functions in sync. If we did one change @@ -135,31 +114,36 @@ save/load functions. An example (from hw/input/pckbd.c) -static const VMStateDescription vmstate_kbd = { - .name = "pckbd", - .version_id = 3, - .minimum_version_id = 3, - .fields = (VMStateField[]) { - VMSTATE_UINT8(write_cmd, KBDState), - VMSTATE_UINT8(status, KBDState), - VMSTATE_UINT8(mode, KBDState), - VMSTATE_UINT8(pending, KBDState), - VMSTATE_END_OF_LIST() - } -}; +.. code:: c + + static const VMStateDescription vmstate_kbd = { + .name = "pckbd", + .version_id = 3, + .minimum_version_id = 3, + .fields = (VMStateField[]) { + VMSTATE_UINT8(write_cmd, KBDState), + VMSTATE_UINT8(status, KBDState), + VMSTATE_UINT8(mode, KBDState), + VMSTATE_UINT8(pending, KBDState), + VMSTATE_END_OF_LIST() + } + }; We are declaring the state with name "pckbd". -The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. +The `version_id` is 3, and the fields are 4 uint8_t in a KBDState structure. We registered this with: +.. code:: c + vmstate_register(NULL, 0, &vmstate_kbd, s); Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. -You can search for VMSTATE_* macros for lots of types used in QEMU in +You can search for ``VMSTATE_*`` macros for lots of types used in QEMU in include/hw/hw.h. -=== More about versions === +More about versions +------------------- Version numbers are intended for major incompatible changes to the migration of a device, and using them breaks backwards-migration @@ -168,22 +152,23 @@ compatibility; in general most changes can be made by adding Subsections You can see that there are several version fields: -- version_id: the maximum version_id supported by VMState for that device. -- minimum_version_id: the minimum version_id that VMState is able to understand +- `version_id`: the maximum version_id supported by VMState for that device. +- `minimum_version_id`: the minimum version_id that VMState is able to understand for that device. -- minimum_version_id_old: For devices that were not able to port to vmstate, we can +- `minimum_version_id_old`: For devices that were not able to port to vmstate, we can assign a function that knows how to read this old state. This field is - ignored if there is no load_state_old handler. + ignored if there is no `load_state_old` handler. So, VMState is able to read versions from minimum_version_id to -version_id. And the function load_state_old() (if present) is able to +version_id. 
And the function ``load_state_old()`` (if present) is able to load state from minimum_version_id_old to minimum_version_id. This function is deprecated and will be removed when no more users are left. Saving state will always create a section with the 'version_id' value and thus can't be loaded by any older QEMU. -=== Massaging functions === +Massaging functions +------------------- Sometimes, it is not enough to be able to save the state directly from one structure, we need to fill the correct values there. One @@ -194,24 +179,24 @@ load the state for the cpu that we have just loaded from the QEMUFile. The functions to do that are inside a vmstate definition, and are called: -- int (*pre_load)(void *opaque); +- ``int (*pre_load)(void *opaque);`` This function is called before we load the state of one device. -- int (*post_load)(void *opaque, int version_id); +- ``int (*post_load)(void *opaque, int version_id);`` This function is called after we load the state of one device. -- int (*pre_save)(void *opaque); +- ``int (*pre_save)(void *opaque);`` This function is called before we save the state of one device. Example: You can look at hpet.c, that uses the three function to - massage the state that is transferred. +massage the state that is transferred. If you use memory API functions that update memory layout outside initialization (i.e., in response to a guest action), this is a strong -indication that you need to call these functions in a post_load callback. +indication that you need to call these functions in a `post_load` callback. Examples of such memory API functions are: - memory_region_add_subregion() @@ -221,7 +206,8 @@ Examples of such memory API functions are: - memory_region_set_address() - memory_region_set_alias_offset() -=== Subsections === +Subsections +----------- The use of version_id allows to be able to migrate from older versions to newer versions of a device. But not the other way around. This @@ -251,52 +237,54 @@ value that it uses. Example: -static bool ide_drive_pio_state_needed(void *opaque) -{ - IDEState *s = opaque; +.. code:: c - return ((s->status & DRQ_STAT) != 0) - || (s->bus->error_status & BM_STATUS_PIO_RETRY); -} + static bool ide_drive_pio_state_needed(void *opaque) + { + IDEState *s = opaque; -const VMStateDescription vmstate_ide_drive_pio_state = { - .name = "ide_drive/pio_state", - .version_id = 1, - .minimum_version_id = 1, - .pre_save = ide_drive_pio_pre_save, - .post_load = ide_drive_pio_post_load, - .needed = ide_drive_pio_state_needed, - .fields = (VMStateField[]) { - VMSTATE_INT32(req_nb_sectors, IDEState), - VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, - vmstate_info_uint8, uint8_t), - VMSTATE_INT32(cur_io_buffer_offset, IDEState), - VMSTATE_INT32(cur_io_buffer_len, IDEState), - VMSTATE_UINT8(end_transfer_fn_idx, IDEState), - VMSTATE_INT32(elementary_transfer_size, IDEState), - VMSTATE_INT32(packet_transfer_size, IDEState), - VMSTATE_END_OF_LIST() - } -}; + return ((s->status & DRQ_STAT) != 0) + || (s->bus->error_status & BM_STATUS_PIO_RETRY); + } -const VMStateDescription vmstate_ide_drive = { - .name = "ide_drive", - .version_id = 3, - .minimum_version_id = 0, - .post_load = ide_drive_post_load, - .fields = (VMStateField[]) { - .... several fields .... 
- VMSTATE_END_OF_LIST() - }, - .subsections = (const VMStateDescription*[]) { - &vmstate_ide_drive_pio_state, - NULL - } -}; + const VMStateDescription vmstate_ide_drive_pio_state = { + .name = "ide_drive/pio_state", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = ide_drive_pio_pre_save, + .post_load = ide_drive_pio_post_load, + .needed = ide_drive_pio_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT32(req_nb_sectors, IDEState), + VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, + vmstate_info_uint8, uint8_t), + VMSTATE_INT32(cur_io_buffer_offset, IDEState), + VMSTATE_INT32(cur_io_buffer_len, IDEState), + VMSTATE_UINT8(end_transfer_fn_idx, IDEState), + VMSTATE_INT32(elementary_transfer_size, IDEState), + VMSTATE_INT32(packet_transfer_size, IDEState), + VMSTATE_END_OF_LIST() + } + }; + + const VMStateDescription vmstate_ide_drive = { + .name = "ide_drive", + .version_id = 3, + .minimum_version_id = 0, + .post_load = ide_drive_post_load, + .fields = (VMStateField[]) { + .... several fields .... + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_ide_drive_pio_state, + NULL + } + }; Here we have a subsection for the pio state. We only need to save/send this state when we are in the middle of a pio operation -(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is +(that is what ``ide_drive_pio_state_needed()`` checks). If DRQ_STAT is not enabled, the values on that fields are garbage and don't need to be sent. @@ -304,11 +292,12 @@ Using a condition function that checks a 'property' to determine whether to send a subsection allows backwards migration compatibility when new subsections are added. -For example; - a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and +For example: + + a) Add a new property using ``DEFINE_PROP_BOOL`` - e.g. support-foo and default it to true. - b) Add an entry to the HW_COMPAT_ for the previous version - that sets the property to false. + b) Add an entry to the ``HW_COMPAT_`` for the previous version that sets + the property to false. c) Add a static bool support_foo function that tests the property. d) Add a subsection with a .needed set to the support_foo function e) (potentially) Add a pre_load that sets up a default value for 'foo' @@ -332,25 +321,30 @@ in most cases. In general the preference is to tie the subsection to the machine type, and allow reliable migrations, unless the behaviour from omission of the subsection is really bad. -= Not sending existing elements = +Not sending existing elements +----------------------------- -Sometimes members of the VMState are no longer needed; - removing them will break migration compatibility - making them version dependent and bumping the version will break backwards - migration compatibility. +Sometimes members of the VMState are no longer needed: + + - removing them will break migration compatibility + + - making them version dependent and bumping the version will break backwards migration compatibility. The best way is to: - a) Add a new property/compatibility/function in the same way for subsections - above. + + a) Add a new property/compatibility/function in the same way for subsections above. b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: - VMSTATE_UINT32(foo, barstruct) + + ``VMSTATE_UINT32(foo, barstruct)`` + becomes - VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz) - Sometime in the future when we no longer care about the ancient -versions these can be killed off. 
+ ``VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)`` -= Return path = + Sometime in the future when we no longer care about the ancient versions these can be killed off. + +Return path +----------- In most migration scenarios there is only a single data path that runs from the source VM to the destination, typically along a single fd (although @@ -360,19 +354,23 @@ However, some uses need two way communication; in particular the Postcopy destination needs to be able to request pages on demand from the source. For these scenarios there is a 'return path' from the destination to the source; -qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return +``qemu_file_get_return_path(QEMUFile* fwdpath)`` gives the QEMUFile* for the return path. Source side + Forward path - written by migration thread Return path - opened by main thread, read by return-path thread Destination side + Forward path - read by main thread Return path - opened by main thread, written by main thread AND postcopy - thread (protected by rp_mutex) + thread (protected by rp_mutex) + +Postcopy +======== -= Postcopy = 'Postcopy' migration is a way to deal with migrations that refuse to converge (or take too long to converge) its plus side is that there is an upper bound on the amount of migration traffic and time it takes, the down side is that during @@ -386,27 +384,30 @@ a fault that's translated by QEMU into a request to the source QEMU. Postcopy can be combined with precopy (i.e. normal migration) so that if precopy doesn't finish in a given time the switch is made to postcopy. -=== Enabling postcopy === +Enabling postcopy +----------------- To enable postcopy, issue this command on the monitor prior to the start of migration: -migrate_set_capability postcopy-ram on +``migrate_set_capability postcopy-ram on`` The normal commands are then used to start a migration, which is still started in precopy mode. Issuing: -migrate_start_postcopy +``migrate_start_postcopy`` will now cause the transition from precopy to postcopy. It can be issued immediately after migration is started or any time later on. Issuing it after the end of a migration is harmless. -Note: During the postcopy phase, the bandwidth limits set using -migrate_set_speed is ignored (to avoid delaying requested pages that -the destination is waiting for). +.. note:: + During the postcopy phase, the bandwidth limits set using + ``migrate_set_speed`` is ignored (to avoid delaying requested pages that + the destination is waiting for). -=== Postcopy device transfer === +Postcopy device transfer +------------------------ Loading of device data may cause the device emulation to access guest RAM that may trigger faults that have to be resolved by the source, as such @@ -416,6 +417,7 @@ before the device load begins to free the stream up. This is achieved by 'packaging' the device data into a blob that's read in one go. Source behaviour +---------------- Until postcopy is entered the migration stream is identical to normal precopy, except for the addition of a 'postcopy advise' command at @@ -423,13 +425,14 @@ the beginning, to tell the destination that postcopy might happen. When postcopy starts the source sends the page discard data and then forms the 'package' containing: - Command: 'postcopy listen' - The device state - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. 
RAM) - Command: 'postcopy run' + - Command: 'postcopy listen' + - The device state -The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + - Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the contents are formatted in the same way as the main migration stream. During postcopy the source scans the list of dirty pages and sends them @@ -441,82 +444,100 @@ to be sent quickly in the hope that those pages are likely to be used by the destination soon. Destination behaviour +--------------------- Initially the destination looks the same as precopy, with a single thread reading the migration stream; the 'postcopy advise' and 'discard' commands are processed to change the way RAM is managed, but don't affect the stream processing. ------------------------------------------------------------------------------- - 1 2 3 4 5 6 7 -main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) -thread | | - | (page request) - | \___ - v \ -listen thread: --- page -- page -- page -- page -- page -- +:: - a b c ------------------------------------------------------------------------------- + ------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 + main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) + thread | | + | (page request) + | \___ + v \ + listen thread: --- page -- page -- page -- page -- page -- -On receipt of CMD_PACKAGED (1) - All the data associated with the package - the ( ... ) section in the -diagram - is read into memory, and the main thread recurses into -qemu_loadvm_state_main to process the contents of the package (2) -which contains commands (3,6) and devices (4...) + a b c + ------------------------------------------------------------------------------ -On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) -a new thread (a) is started that takes over servicing the migration stream, -while the main thread carries on loading the package. It loads normal -background page data (b) but if during a device load a fault happens (5) the -returned page (c) is loaded by the listen thread allowing the main threads -device load to carry on. +- On receipt of ``CMD_PACKAGED`` (1) -The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination -CPUs start running. -At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour -and is no longer used by migration, while the listen thread carries -on servicing page data until the end of migration. + All the data associated with the package - the ( ... ) section in the diagram - + is read into memory, and the main thread recurses into qemu_loadvm_state_main + to process the contents of the package (2) which contains commands (3,6) and + devices (4...) -=== Postcopy states === +- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) + + a new thread (a) is started that takes over servicing the migration stream, + while the main thread carries on loading the package. It loads normal + background page data (b) but if during a device load a fault happens (5) + the returned page (c) is loaded by the listen thread allowing the main + threads device load to carry on. + +- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) + + letting the destination CPUs start running. 
At the end of the + ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and + is no longer used by migration, while the listen thread carries on servicing + page data until the end of migration. + +Postcopy states +--------------- Postcopy moves through a series of states (see postcopy_state) from ADVISE->DISCARD->LISTEN->RUNNING->END - Advise: Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) + - Advise - Discard: Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) - Listen: The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). + - Discard - Running: POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. - End: The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. + - Listen -=== Source side page maps === + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). 
+ + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Source side page maps +--------------------- The source side keeps two bitmaps during postcopy; 'the migration bitmap' and 'unsent map'. The 'migration bitmap' is basically the same as in @@ -529,6 +550,7 @@ The 'unsent map' is used for the transition to postcopy. It is a bitmap that has a bit cleared whenever a page is sent to the destination, however during the transition to postcopy mode it is combined with the migration bitmap to form a set of pages that: + a) Have been sent but then redirtied (which must be discarded) b) Have not yet been sent - which also must be discarded to cause any transparent huge pages built during precopy to be broken. @@ -540,15 +562,17 @@ request for a page that has already been sent is ignored. Duplicate requests such as this can happen as a page is sent at about the same time the destination accesses it. -=== Postcopy with hugepages === +Postcopy with hugepages +----------------------- Postcopy now works with hugetlbfs backed memory: + a) The linux kernel on the destination must support userfault on hugepages. b) The huge-page configuration on the source and destination VMs must be identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that -mem-path /dev/hugepages will fall back to allocating normal + c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using -mem-prealloc enforces the allocation using hugepages. + Using ``-mem-prealloc`` enforces the allocation using hugepages. d) Care should be taken with the size of hugepage used; postcopy with 2MB hugepages works well, however 1GB hugepages are likely to be problematic since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, From 9c43435dc1f872a1452792fd8d9dbcf5c0482e46 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Mon, 30 Oct 2017 14:27:40 +0100 Subject: [PATCH 05/27] migration: free result string Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert Reported-by: Peter Xu --- tests/migration-test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index be598d3257..799e24ebc6 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -358,13 +358,14 @@ static void migrate_check_parameter(QTestState *who, const char *parameter, const char *value) { QDict *rsp, *rsp_return; - const char *result; + char *result; rsp = wait_command(who, "{ 'execute': 'query-migrate-parameters' }"); rsp_return = qdict_get_qdict(rsp, "return"); result = g_strdup_printf("%" PRId64, qdict_get_try_int(rsp_return, parameter, -1)); g_assert_cmpstr(result, ==, value); + g_free(result); QDECREF(rsp); } From 029ff892e1049ca4f93f9d8f0059bc02fffd6e32 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 5 Dec 2017 13:27:40 +0100 Subject: [PATCH 06/27] migration: fix analyze-migration.py script with radix table Since commit 3a38429748 ("Add a "no HPT" encoding to HTAB migration stream") the HTAB migration stream contains a header set to "-1", meaning there is no HPT. Teach analyze-migration.py to ignore the section in this case. 
Without this fix, the script fails with a dump from a POWER9 guest: Traceback (most recent call last): File "./qemu/scripts/analyze-migration.py", line 602, in dump.read(dump_memory = args.memory) File "./qemu/scripts/analyze-migration.py", line 539, in read section.read() File "./qemu/scripts/analyze-migration.py", line 250, in read self.file.readvar(n_valid * self.HASH_PTE_SIZE_64) File "./qemu/scripts/analyze-migration.py", line 64, in readvar raise Exception("Unexpected end of %s at 0x%x" % (self.filename, self.file.tell())) Exception: Unexpected end of migrate.dump at 0x1d4763ba Fixes: 3a38429748 ("Add a "no HPT" encoding to HTAB migration stream") Signed-off-by: Laurent Vivier Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Greg Kurz Signed-off-by: Juan Quintela --- scripts/analyze-migration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py index 14553876a2..88ff4adb30 100755 --- a/scripts/analyze-migration.py +++ b/scripts/analyze-migration.py @@ -234,6 +234,10 @@ class HTABSection(object): header = self.file.read32() + if (header == -1): + # "no HPT" encoding + return + if (header > 0): # First section, just the hash shift return From 31bf06a9d6844d9977b028bd9b855d53288f979c Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:25 +0300 Subject: [PATCH 07/27] migration: introduce postcopy-blocktime capability Right now it could be used on destination side to enable vCPU blocktime calculation for postcopy live migration. vCPU blocktime - it's time since vCPU thread was put into interruptible sleep, till memory page was copied and thread awake. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 9 +++++++++ migration/migration.h | 1 + qapi/migration.json | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 19917a4b5b..fb876e9fca 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1499,6 +1499,15 @@ bool migrate_zero_blocks(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; } +bool migrate_postcopy_blocktime(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME]; +} + bool migrate_use_compression(void) { MigrationState *s; diff --git a/migration/migration.h b/migration/migration.h index 663415fe48..9ab8d07004 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -201,6 +201,7 @@ int migrate_compress_level(void); int migrate_compress_threads(void); int migrate_decompress_threads(void); bool migrate_use_events(void); +bool migrate_postcopy_blocktime(void); /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, diff --git a/qapi/migration.json b/qapi/migration.json index 4cd3d13158..4d8ccdf776 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -352,12 +352,16 @@ # # @x-multifd: Use more than one fd for migration (since 2.11) # +# @postcopy-blocktime: Calculate downtime for postcopy live migration +# (since 2.12) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', 'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram', - 'block', 'return-path', 'pause-before-switchover', 'x-multifd' ] } + 'block', 'return-path', 'pause-before-switchover', 
'x-multifd', + 'postcopy-blocktime' ] } ## # @MigrationCapabilityStatus: From 01a87f0bd38b06ff92e0fc08ecf54eefab6d9ca8 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:26 +0300 Subject: [PATCH 08/27] migration: add postcopy blocktime ctx into MigrationIncomingState This patch adds request to kernel space for UFFD_FEATURE_THREAD_ID, in case this feature is provided by kernel. PostcopyBlocktimeContext is encapsulated inside postcopy-ram.c, due to it being a postcopy-only feature. Also it defines PostcopyBlocktimeContext's instance live time. Information from PostcopyBlocktimeContext instance will be provided much after postcopy migration end, instance of PostcopyBlocktimeContext will live till QEMU exit, but part of it (vcpu_addr, page_fault_vcpu_time) used only during calculation, will be released when postcopy ended or failed. To enable postcopy blocktime calculation on destination, need to request proper compatibility (Patch for documentation will be at the tail of the patch set). As an example following command enable that capability, assume QEMU was started with -chardev socket,id=charmonitor,path=/var/lib/migrate-vm-monitor.sock option to control it [root@host]#printf "{\"execute\" : \"qmp_capabilities\"}\r\n \ {\"execute\": \"migrate-set-capabilities\" , \"arguments\": { \"capabilities\": [ { \"capability\": \"postcopy-blocktime\", \"state\": true } ] } }" | nc -U /var/lib/migrate-vm-monitor.sock Or just with HMP (qemu) migrate_set_capability postcopy-blocktime on Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.h | 8 ++++++ migration/postcopy-ram.c | 59 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/migration/migration.h b/migration/migration.h index 9ab8d07004..7e06c11773 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -22,6 +22,8 @@ #include "hw/qdev.h" #include "io/channel.h" +struct PostcopyBlocktimeContext; + /* State for the incoming migration */ struct MigrationIncomingState { QEMUFile *from_src_file; @@ -59,6 +61,12 @@ struct MigrationIncomingState { /* The coroutine we should enter (back) after failover */ Coroutine *migration_incoming_co; QemuSemaphore colo_incoming_sem; + + /* + * PostcopyBlocktimeContext to keep information for postcopy + * live migration, to calculate vCPU block time + * */ + struct PostcopyBlocktimeContext *blocktime_ctx; }; MigrationIncomingState *migration_incoming_get_current(void); diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index bec6c2c66b..88f3de44ad 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -61,6 +61,52 @@ struct PostcopyDiscardState { #include #include +typedef struct PostcopyBlocktimeContext { + /* time when page fault initiated per vCPU */ + int64_t *page_fault_vcpu_time; + /* page address per vCPU */ + uintptr_t *vcpu_addr; + int64_t total_blocktime; + /* blocktime per vCPU */ + int64_t *vcpu_blocktime; + /* point in time when last page fault was initiated */ + int64_t last_begin; + /* number of vCPU are suspended */ + int smp_cpus_down; + + /* + * Handler for exit event, necessary for + * releasing whole blocktime_ctx + */ + Notifier exit_notifier; +} PostcopyBlocktimeContext; + +static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx) +{ + g_free(ctx->page_fault_vcpu_time); + g_free(ctx->vcpu_addr); + g_free(ctx->vcpu_blocktime); + g_free(ctx); +} + +static void migration_exit_cb(Notifier 
*n, void *data) +{ + PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext, + exit_notifier); + destroy_blocktime_context(ctx); +} + +static struct PostcopyBlocktimeContext *blocktime_context_new(void) +{ + PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1); + ctx->page_fault_vcpu_time = g_new0(int64_t, smp_cpus); + ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus); + ctx->vcpu_blocktime = g_new0(int64_t, smp_cpus); + + ctx->exit_notifier.notify = migration_exit_cb; + qemu_add_exit_notifier(&ctx->exit_notifier); + return ctx; +} /** * receive_ufd_features: check userfault fd features, to request only supported @@ -153,6 +199,19 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis) } } +#ifdef UFFD_FEATURE_THREAD_ID + if (migrate_postcopy_blocktime() && mis && + UFFD_FEATURE_THREAD_ID & supported_features) { + /* kernel supports that feature */ + /* don't create blocktime_context if it exists */ + if (!mis->blocktime_ctx) { + mis->blocktime_ctx = blocktime_context_new(); + } + + asked_features |= UFFD_FEATURE_THREAD_ID; + } +#endif + /* * request features, even if asked_features is 0, due to * kernel expects UFFD_API before UFFDIO_REGISTER, per From 3be98be4e9f59055afb5f2b27f9296c7093b4e75 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:27 +0300 Subject: [PATCH 09/27] migration: calculate vCPU blocktime on dst side This patch provides blocktime calculation per vCPU, as a summary and as a overlapped value for all vCPUs. This approach was suggested by Peter Xu, as an improvements of previous approch where QEMU kept tree with faulted page address and cpus bitmask in it. Now QEMU is keeping array with faulted page address as value and vCPU as index. It helps to find proper vCPU at UFFD_COPY time. Also it keeps list for blocktime per vCPU (could be traced with page_fault_addr) Blocktime will not calculated if postcopy_blocktime field of MigrationIncomingState wasn't initialized. Signed-off-by: Alexey Perevalov Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/postcopy-ram.c | 143 ++++++++++++++++++++++++++++++++++++++- migration/trace-events | 5 +- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 88f3de44ad..ca8c3bce8e 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -553,6 +553,142 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, return 0; } +static int get_mem_fault_cpu_index(uint32_t pid) +{ + CPUState *cpu_iter; + + CPU_FOREACH(cpu_iter) { + if (cpu_iter->thread_id == pid) { + trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid); + return cpu_iter->cpu_index; + } + } + trace_get_mem_fault_cpu_index(-1, pid); + return -1; +} + +/* + * This function is being called when pagefault occurs. It + * tracks down vCPU blocking time. 
+ * + * @addr: faulted host virtual address + * @ptid: faulted process thread id + * @rb: ramblock appropriate to addr + */ +static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, + RAMBlock *rb) +{ + int cpu, already_received; + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *dc = mis->blocktime_ctx; + int64_t now_ms; + + if (!dc || ptid == 0) { + return; + } + cpu = get_mem_fault_cpu_index(ptid); + if (cpu < 0) { + return; + } + + now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if (dc->vcpu_addr[cpu] == 0) { + atomic_inc(&dc->smp_cpus_down); + } + + atomic_xchg__nocheck(&dc->last_begin, now_ms); + atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms); + atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr); + + /* check it here, not at the begining of the function, + * due to, check could accur early than bitmap_set in + * qemu_ufd_copy_ioctl */ + already_received = ramblock_recv_bitmap_test(rb, (void *)addr); + if (already_received) { + atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0); + atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0); + atomic_dec(&dc->smp_cpus_down); + } + trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu], + cpu, already_received); +} + +/* + * This function just provide calculated blocktime per cpu and trace it. + * Total blocktime is calculated in mark_postcopy_blocktime_end. + * + * + * Assume we have 3 CPU + * + * S1 E1 S1 E1 + * -----***********------------xxx***************------------------------> CPU1 + * + * S2 E2 + * ------------****************xxx---------------------------------------> CPU2 + * + * S3 E3 + * ------------------------****xxx********-------------------------------> CPU3 + * + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1 + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3 + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 - + * it's a part of total blocktime. 
+ * S1 - here is last_begin + * Legend of the picture is following: + * * - means blocktime per vCPU + * x - means overlapped blocktime (total blocktime) + * + * @addr: host virtual address + */ +static void mark_postcopy_blocktime_end(uintptr_t addr) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *dc = mis->blocktime_ctx; + int i, affected_cpu = 0; + int64_t now_ms; + bool vcpu_total_blocktime = false; + int64_t read_vcpu_time; + + if (!dc) { + return; + } + + now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + /* lookup cpu, to clear it, + * that algorithm looks straighforward, but it's not + * optimal, more optimal algorithm is keeping tree or hash + * where key is address value is a list of */ + for (i = 0; i < smp_cpus; i++) { + uint64_t vcpu_blocktime = 0; + + read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0); + if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr || + read_vcpu_time == 0) { + continue; + } + atomic_xchg__nocheck(&dc->vcpu_addr[i], 0); + vcpu_blocktime = now_ms - read_vcpu_time; + affected_cpu += 1; + /* we need to know is that mark_postcopy_end was due to + * faulted page, another possible case it's prefetched + * page and in that case we shouldn't be here */ + if (!vcpu_total_blocktime && + atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) { + vcpu_total_blocktime = true; + } + /* continue cycle, due to one page could affect several vCPUs */ + dc->vcpu_blocktime[i] += vcpu_blocktime; + } + + atomic_sub(&dc->smp_cpus_down, affected_cpu); + if (vcpu_total_blocktime) { + dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0); + } + trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime, + affected_cpu); +} + /* * Handle faults detected by the USERFAULT markings */ @@ -630,8 +766,11 @@ static void *postcopy_ram_fault_thread(void *opaque) rb_offset &= ~(qemu_ram_pagesize(rb) - 1); trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, qemu_ram_get_idstr(rb), - rb_offset); + rb_offset, + msg.arg.pagefault.feat.ptid); + mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address), + msg.arg.pagefault.feat.ptid, rb); /* * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) @@ -721,6 +860,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr, if (!ret) { ramblock_recv_bitmap_set_range(rb, host_addr, pagesize / qemu_target_page_size()); + mark_postcopy_blocktime_end((uintptr_t)host_addr); + } return ret; } diff --git a/migration/trace-events b/migration/trace-events index 6f29fcc686..462d15717d 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -115,6 +115,8 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" process_incoming_migration_co_postcopy_end_main(void) "" migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s" migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s" +mark_postcopy_blocktime_begin(uint64_t addr, void *dd, int64_t time, int cpu, int received) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", cpu: %d, already_received: %d" +mark_postcopy_blocktime_end(uint64_t addr, void *dd, int64_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %" PRId64 ", affected_cpu: %d" # migration/rdma.c qemu_rdma_accept_incoming_migration(void) "" @@ -191,7 +193,7 @@ postcopy_ram_enable_notify(void) "" postcopy_ram_fault_thread_entry(void) "" 
postcopy_ram_fault_thread_exit(void) "" postcopy_ram_fault_thread_quit(void) "" -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx" +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u" postcopy_ram_incoming_cleanup_closeuf(void) "" postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" @@ -200,6 +202,7 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u" # migration/exec.c migration_exec_outgoing(const char *cmd) "cmd=%s" From 2f7dae9dd3b5bb436f6089816cf24f4d0867d32e Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:28 +0300 Subject: [PATCH 10/27] migration: postcopy_blocktime documentation Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- docs/devel/migration.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst index bf97080dac..015a9ebdf7 100644 --- a/docs/devel/migration.rst +++ b/docs/devel/migration.rst @@ -401,6 +401,20 @@ will now cause the transition from precopy to postcopy. It can be issued immediately after migration is started or any time later on. Issuing it after the end of a migration is harmless. +Blocktime is a postcopy live migration metric, intended to show how +long the vCPU was in state of interruptable sleep due to pagefault. +That metric is calculated both for all vCPUs as overlapped value, and +separately for each vCPU. These values are calculated on destination +side. To enable postcopy blocktime calculation, enter following +command on destination monitor: + +``migrate_set_capability postcopy-blocktime on`` + +Postcopy blocktime can be retrieved by query-migrate qmp command. +postcopy-blocktime value of qmp command will show overlapped blocking +time for all vCPU, postcopy-vcpu-blocktime will show list of blocking +time per vCPU. + .. note:: During the postcopy phase, the bandwidth limits set using ``migrate_set_speed`` is ignored (to avoid delaying requested pages that From 5f32dc8ee073d250abd9e8915f6884ba20d5de62 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:29 +0300 Subject: [PATCH 11/27] migration: add blocktime calculation into migration-test This patch just requests blocktime calculation, and check it in case when UFFD_FEATURE_THREAD_ID feature is set on the host. Reviewed-by: Dr. 
David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- tests/migration-test.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/migration-test.c b/tests/migration-test.c index 799e24ebc6..9fd5dadc0d 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -25,6 +25,7 @@ const unsigned start_address = 1024 * 1024; const unsigned end_address = 100 * 1024 * 1024; bool got_stop; +static bool uffd_feature_thread_id; #if defined(__linux__) #include @@ -54,6 +55,7 @@ static bool ufd_version_check(void) g_test_message("Skipping test: UFFDIO_API failed"); return false; } + uffd_feature_thread_id = api_struct.features & UFFD_FEATURE_THREAD_ID; ioctl_mask = (__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER; @@ -266,6 +268,16 @@ static uint64_t get_migration_pass(QTestState *who) return result; } +static void read_blocktime(QTestState *who) +{ + QDict *rsp, *rsp_return; + + rsp = wait_command(who, "{ 'execute': 'query-migrate' }"); + rsp_return = qdict_get_qdict(rsp, "return"); + g_assert(qdict_haskey(rsp_return, "postcopy-blocktime")); + QDECREF(rsp); +} + static void wait_for_migration_complete(QTestState *who) { QDict *rsp, *rsp_return; @@ -525,6 +537,7 @@ static void test_migrate(void) migrate_set_capability(from, "postcopy-ram", "true"); migrate_set_capability(to, "postcopy-ram", "true"); + migrate_set_capability(to, "postcopy-blocktime", "true"); /* We want to pick a speed slow enough that the test completes * quickly, but that it doesn't complete precopy even on a slow @@ -553,6 +566,9 @@ static void test_migrate(void) wait_for_serial("dest_serial"); wait_for_migration_complete(from); + if (uffd_feature_thread_id) { + read_blocktime(to); + } g_free(uri); test_migrate_end(from, to); From ca6011c2329121ecc25081f884a5d9016acc6572 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Mon, 30 Oct 2017 16:16:30 +0300 Subject: [PATCH 12/27] migration: add postcopy total blocktime into query-migrate Postcopy total blocktime is available on destination side only. But query-migrate was possible only for source. This patch adds ability to call query-migrate on destination. To be able to see postcopy blocktime, need to request postcopy-blocktime capability. The query-migrate command will show following sample result: {"return": "postcopy-vcpu-blocktime": [115, 100], "status": "completed", "postcopy-blocktime": 100 }} postcopy_vcpu_blocktime contains list, where the first item is the first vCPU in QEMU. This patch has a drawback, it combines states of incoming and outgoing migration. Ongoing migration state will overwrite incoming state. Looks like better to separate query-migrate for incoming and outgoing migration or add parameter to indicate type of migration. Reviewed-by: Dr. 
David Alan Gilbert Signed-off-by: Alexey Perevalov Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- hmp.c | 15 +++++++++++ migration/migration.c | 42 +++++++++++++++++++++++++++--- migration/migration.h | 4 +++ migration/postcopy-ram.c | 56 ++++++++++++++++++++++++++++++++++++++++ migration/trace-events | 1 + qapi/migration.json | 11 +++++++- 6 files changed, 124 insertions(+), 5 deletions(-) diff --git a/hmp.c b/hmp.c index 056bf70cf1..c6bab5373b 100644 --- a/hmp.c +++ b/hmp.c @@ -264,6 +264,21 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) info->cpu_throttle_percentage); } + if (info->has_postcopy_blocktime) { + monitor_printf(mon, "postcopy blocktime: %" PRId64 "\n", + info->postcopy_blocktime); + } + + if (info->has_postcopy_vcpu_blocktime) { + Visitor *v; + char *str; + v = string_output_visitor_new(false, &str); + visit_type_int64List(v, NULL, &info->postcopy_vcpu_blocktime, NULL); + visit_complete(v, &str); + monitor_printf(mon, "postcopy vcpu blocktime: %s\n", str); + g_free(str); + visit_free(v); + } qapi_free_MigrationInfo(info); qapi_free_MigrationCapabilityStatusList(caps); } diff --git a/migration/migration.c b/migration/migration.c index fb876e9fca..17cc219da2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -591,14 +591,15 @@ static void populate_disk_info(MigrationInfo *info) } } -MigrationInfo *qmp_query_migrate(Error **errp) +static void fill_source_migration_info(MigrationInfo *info) { - MigrationInfo *info = g_malloc0(sizeof(*info)); MigrationState *s = migrate_get_current(); switch (s->state) { case MIGRATION_STATUS_NONE: /* no migration has happened ever */ + /* do not overwrite destination migration status */ + return; break; case MIGRATION_STATUS_SETUP: info->has_status = true; @@ -649,8 +650,6 @@ MigrationInfo *qmp_query_migrate(Error **errp) break; } info->status = s->state; - - return info; } /** @@ -714,6 +713,41 @@ static bool migrate_caps_check(bool *cap_list, return true; } +static void fill_destination_migration_info(MigrationInfo *info) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + switch (mis->state) { + case MIGRATION_STATUS_NONE: + return; + break; + case MIGRATION_STATUS_SETUP: + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_COLO: + info->has_status = true; + break; + case MIGRATION_STATUS_COMPLETED: + info->has_status = true; + fill_destination_postcopy_migration_info(info); + break; + } + info->status = mis->state; +} + +MigrationInfo *qmp_query_migrate(Error **errp) +{ + MigrationInfo *info = g_malloc0(sizeof(*info)); + + fill_destination_migration_info(info); + fill_source_migration_info(info); + + return info; +} + void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, Error **errp) { diff --git a/migration/migration.h b/migration/migration.h index 7e06c11773..d84bc550b1 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -71,6 +71,10 @@ struct MigrationIncomingState { MigrationIncomingState *migration_incoming_get_current(void); void migration_incoming_state_destroy(void); +/* + * Functions to work with blocktime context + */ +void fill_destination_postcopy_migration_info(MigrationInfo *info); #define TYPE_MIGRATION "migration" diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index ca8c3bce8e..7814da5b4b 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -108,6 
+108,55 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void) return ctx; } +static int64List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx) +{ + int64List *list = NULL, *entry = NULL; + int i; + + for (i = smp_cpus - 1; i >= 0; i--) { + entry = g_new0(int64List, 1); + entry->value = ctx->vcpu_blocktime[i]; + entry->next = list; + list = entry; + } + + return list; +} + +/* + * This function just populates MigrationInfo from postcopy's + * blocktime context. It will not populate MigrationInfo, + * unless postcopy-blocktime capability was set. + * + * @info: pointer to MigrationInfo to populate + */ +void fill_destination_postcopy_migration_info(MigrationInfo *info) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *bc = mis->blocktime_ctx; + + if (!bc) { + return; + } + + info->has_postcopy_blocktime = true; + info->postcopy_blocktime = bc->total_blocktime; + info->has_postcopy_vcpu_blocktime = true; + info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc); +} + +static uint64_t get_postcopy_total_blocktime(void) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyBlocktimeContext *bc = mis->blocktime_ctx; + + if (!bc) { + return 0; + } + + return bc->total_blocktime; +} + /** * receive_ufd_features: check userfault fd features, to request only supported * features in the future. @@ -482,6 +531,9 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size); mis->postcopy_tmp_zero_page = NULL; } + trace_postcopy_ram_incoming_cleanup_blocktime( + get_postcopy_total_blocktime()); + trace_postcopy_ram_incoming_cleanup_exit(); return 0; } @@ -959,6 +1011,10 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis) #else /* No target OS support, stubs just fail */ +void fill_destination_postcopy_migration_info(MigrationInfo *info) +{ +} + bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) { error_report("%s: No OS support", __func__); diff --git a/migration/trace-events b/migration/trace-events index 462d15717d..141e773305 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -198,6 +198,7 @@ postcopy_ram_incoming_cleanup_closeuf(void) "" postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" postcopy_ram_incoming_cleanup_join(void) "" +postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64 save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations" diff --git a/qapi/migration.json b/qapi/migration.json index 4d8ccdf776..70e7b677ef 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -155,6 +155,13 @@ # @error-desc: the human readable error description string, when # @status is 'failed'. Clients should not attempt to parse the # error strings. (Since 2.7) +# +# @postcopy-blocktime: total time when all vCPU were blocked during postcopy +# live migration (Since 2.12) +# +# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU (Since 2.12) +# + # # Since: 0.14.0 ## @@ -167,7 +174,9 @@ '*downtime': 'int', '*setup-time': 'int', '*cpu-throttle-percentage': 'int', - '*error-desc': 'str'} } + '*error-desc': 'str', + '*postcopy-blocktime' : 'int64', + '*postcopy-vcpu-blocktime': ['int64']} } ## # @query-migrate: From bae416e5ba65701d3c5238164517158066d615e5 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Fri, 15 Dec 2017 11:51:23 +0000 Subject: [PATCH 13/27] migration: Guard ram_bytes_remaining against early call Calling ram_bytes_remaining during the early part of setup is unsafe because the ram_state isn't yet initialised. This can happen in the sequence: migrate migrate_cancel info migrate if the migrate sticks trying to connect (e.g. to an unresponsive destination due to the connect timeout). Here 'info migrate' sees a state of CANCELLING and so assumes the migrate has partially happened. partial fix for: RH bz: https://bugzilla.redhat.com/show_bug.cgi?id=1525899 Reported-by: Xianxian Wang Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/ram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 021d583b9b..cb1950f3eb 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -237,7 +237,8 @@ static RAMState *ram_state; uint64_t ram_bytes_remaining(void) { - return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE; + return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : + 0; } MigrationStats ram_counters; From 1f8956041ad3851c28f0e3f6ee023d3bcc55ca50 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 28 Dec 2017 12:16:16 +0300 Subject: [PATCH 14/27] migration: finalize current_migration object current_migration has .instance_finalize callback, but it is not called, because nobody unrefs current_migration. Fix that. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- include/migration/misc.h | 1 + migration/migration.c | 5 +++++ vl.c | 1 + 3 files changed, 7 insertions(+) diff --git a/include/migration/misc.h b/include/migration/misc.h index c079b7771b..77fd4f587c 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -44,6 +44,7 @@ void dump_vmstate_json_to_file(FILE *out_fp); /* migration/migration.c */ void migration_object_init(void); +void migration_object_finalize(void); void qemu_start_incoming_migration(const char *uri, Error **errp); bool migration_is_idle(void); void add_migration_state_change_notifier(Notifier *notify); diff --git a/migration/migration.c b/migration/migration.c index 17cc219da2..7a77b193c1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -132,6 +132,11 @@ void migration_object_init(void) } } +void migration_object_finalize(void) +{ + object_unref(OBJECT(current_migration)); +} + /* For outgoing */ MigrationState *migrate_get_current(void) { diff --git a/vl.c b/vl.c index 444b7507da..f1c52fe9fe 100644 --- a/vl.c +++ b/vl.c @@ -4792,6 +4792,7 @@ int main(int argc, char **argv, char **envp) monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); + migration_object_finalize(); /* TODO: unref root container, check all devices are ok */ return 0; From 0d649a0e95b46fab94090f852a43c72d2621ebb9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:05 +0800 Subject: [PATCH 15/27] migration: assert colo instead of check When reaching here if we are still "active" it means we must be in colo state. After a quick discussion offlist, we decided to use the safer error_report(). Finally I want to use "switch" here rather than lots of complicated if clauses. 
Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 7a77b193c1..f579752d21 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2348,7 +2348,15 @@ static void *migration_thread(void *opaque) } runstate_set(RUN_STATE_POSTMIGRATE); } else { - if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) { + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * We should really assert here, but since it's during + * migration, let's try to reduce the usage of assertions. + */ + if (!enable_colo) { + error_report("%s: critical error: calling COLO code without " + "COLO enabled", __func__); + } migrate_start_colo_process(s); qemu_savevm_state_cleanup(); /* From 0ceccd858a8d2d462109c2e0e0ca51861eebdd80 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:06 +0800 Subject: [PATCH 16/27] migration: qemu_savevm_state_cleanup() in cleanup Moving existing callers all into migrate_fd_cleanup(). It simplifies migration_thread() a bit. Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index f579752d21..02d8bc07b1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1107,6 +1107,8 @@ static void migrate_fd_cleanup(void *opaque) qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; + qemu_savevm_state_cleanup(); + if (s->to_dst_file) { Error *local_err = NULL; @@ -2329,13 +2331,6 @@ static void *migration_thread(void *opaque) end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_mutex_lock_iothread(); - /* - * The resource has been allocated by migration will be reused in COLO - * process, so don't release them. - */ - if (!enable_colo) { - qemu_savevm_state_cleanup(); - } if (s->state == MIGRATION_STATUS_COMPLETED) { uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); s->total_time = end_time - s->total_time; @@ -2358,7 +2353,6 @@ static void *migration_thread(void *opaque) "COLO enabled", __func__); } migrate_start_colo_process(s); - qemu_savevm_state_cleanup(); /* * Fixme: we will run VM in COLO no matter its old running state. * After exited COLO, we will keep running. From deb74fb6701755a4f864977608ea3919e4970fae Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:07 +0800 Subject: [PATCH 17/27] migration: remove "enable_colo" var It's only used once, clean it up a bit. Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 02d8bc07b1..0b513c2020 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2216,7 +2216,6 @@ static void *migration_thread(void *opaque) bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; - bool enable_colo = migrate_colo_enabled(); rcu_register_thread(); @@ -2348,7 +2347,7 @@ static void *migration_thread(void *opaque) * We should really assert here, but since it's during * migration, let's try to reduce the usage of assertions. 
*/ - if (!enable_colo) { + if (!migrate_colo_enabled()) { error_report("%s: critical error: calling COLO code without " "COLO enabled", __func__); } From 4af246a34e3155365e6be9dda201f3467d0498d9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:08 +0800 Subject: [PATCH 18/27] migration: split use of MigrationState.total_time It was used either to: 1. store initial timestamp of migration start, and 2. store total time used by last migration Let's provide two parameters for each of them. Mix use of the two is slightly misleading. Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 7 ++++--- migration/migration.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 0b513c2020..23f6e5092c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -619,7 +619,7 @@ static void fill_source_migration_info(MigrationInfo *info) info->has_status = true; info->has_total_time = true; info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - s->total_time; + - s->start_time; info->has_expected_downtime = true; info->expected_downtime = s->expected_downtime; info->has_setup_time = true; @@ -1300,7 +1300,8 @@ MigrationState *migrate_init(void) migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); - s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->total_time = 0; return s; } @@ -2332,7 +2333,7 @@ static void *migration_thread(void *opaque) qemu_mutex_lock_iothread(); if (s->state == MIGRATION_STATUS_COMPLETED) { uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); - s->total_time = end_time - s->total_time; + s->total_time = end_time - s->start_time; if (!entered_postcopy) { s->downtime = end_time - start_time; } diff --git a/migration/migration.h b/migration/migration.h index d84bc550b1..d1132b1cf7 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -115,6 +115,9 @@ struct MigrationState } rp_state; double mbps; + /* Timestamp when recent migration starts (ms) */ + int64_t start_time; + /* Total time used by latest migration (ms) */ int64_t total_time; int64_t downtime; int64_t expected_downtime; From 7287cbd46e2d6fb582ca78c3cb49b1e53d91a761 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:09 +0800 Subject: [PATCH 19/27] migration: move vm_old_running into global state Firstly, it was passed around. Let's just move it into MigrationState just like many other variables as state of migration, renaming it to vm_was_running. One thing to mention is that for postcopy, we actually don't need this knowledge at all since postcopy can't resume a VM even if it fails (we can see that from the old code too: when we try to resume we also check against "entered_postcopy" variable). So further we do this: - in postcopy_start(), we don't update vm_old_running since useless - in migration_thread(), we don't need to check entered_postcopy when resume, since it's only used for precopy. Comment this out too for that variable definition. 
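As a rough, self-contained illustration of that refactor (the struct and function names below are stand-ins, not QEMU's), the flag simply lives in the migration state object and is consulted at the end, instead of being threaded through as a bool pointer:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for MigrationState: vm_was_running is kept with the rest of the
 * migration state rather than passed around as a bool *old_vm_running. */
struct mig_state {
    bool vm_was_running;
};

/* Completion stage: remember whether the guest was running when stopped. */
static void completion(struct mig_state *s, bool guest_running_now)
{
    s->vm_was_running = guest_running_now;
}

/* End of the migration thread: restart the guest only if it had been
 * running before we stopped it (precopy failure/cancel case). */
static void finish(struct mig_state *s, bool migration_failed)
{
    if (migration_failed && s->vm_was_running) {
        printf("restarting guest on the source\n");
    }
}

int main(void)
{
    struct mig_state s = { .vm_was_running = false };
    completion(&s, true);
    finish(&s, true);
    return 0;
}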
Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 17 +++++++---------- migration/migration.h | 6 ++++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 23f6e5092c..e26f80d9aa 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1302,6 +1302,7 @@ MigrationState *migrate_init(void) s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); s->total_time = 0; + s->vm_was_running = false; return s; } @@ -1885,7 +1886,7 @@ static int await_return_path_close_on_source(MigrationState *ms) * Switch from normal iteration to postcopy * Returns non-0 on error */ -static int postcopy_start(MigrationState *ms, bool *old_vm_running) +static int postcopy_start(MigrationState *ms) { int ret; QIOChannelBuffer *bioc; @@ -1903,7 +1904,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) trace_postcopy_start_set_run(); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); global_state_store(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); if (ret < 0) { @@ -2094,11 +2094,9 @@ static int migration_maybe_pause(MigrationState *s, * * @s: Current migration state * @current_active_state: The migration state we expect to be in - * @*old_vm_running: Pointer to old_vm_running flag * @*start_time: Pointer to time to update */ static void migration_completion(MigrationState *s, int current_active_state, - bool *old_vm_running, int64_t *start_time) { int ret; @@ -2107,7 +2105,7 @@ static void migration_completion(MigrationState *s, int current_active_state, qemu_mutex_lock_iothread(); *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); + s->vm_was_running = runstate_is_running(); ret = global_state_store(); if (!ret) { @@ -2213,7 +2211,6 @@ static void *migration_thread(void *opaque) int64_t threshold_size = 0; int64_t start_time = initial_time; int64_t end_time; - bool old_vm_running = false; bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; @@ -2272,7 +2269,7 @@ static void *migration_thread(void *opaque) pend_nonpost <= threshold_size && atomic_read(&s->start_postcopy)) { - if (!postcopy_start(s, &old_vm_running)) { + if (!postcopy_start(s)) { current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; entered_postcopy = true; } @@ -2284,7 +2281,7 @@ static void *migration_thread(void *opaque) } else { trace_migration_thread_low_pending(pending_size); migration_completion(s, current_active_state, - &old_vm_running, &start_time); + &start_time); break; } } @@ -2357,9 +2354,9 @@ static void *migration_thread(void *opaque) * Fixme: we will run VM in COLO no matter its old running state. * After exited COLO, we will keep running. */ - old_vm_running = true; + s->vm_was_running = true; } - if (old_vm_running && !entered_postcopy) { + if (s->vm_was_running) { vm_start(); } else { if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { diff --git a/migration/migration.h b/migration/migration.h index d1132b1cf7..4cae263992 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -123,6 +123,12 @@ struct MigrationState int64_t expected_downtime; bool enabled_capabilities[MIGRATION_CAPABILITY__MAX]; int64_t setup_time; + /* + * Whether guest was running when we enter the completion stage. 
+ * If migration is interrupted by any reason, we need to continue + * running the guest on source. + */ + bool vm_was_running; /* Flag set once the migration has been asked to enter postcopy */ bool start_postcopy; From 64909f974012038266a4785a33935742f1e96b84 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:10 +0800 Subject: [PATCH 20/27] migration: introduce downtime_start Introduce MigrationState.downtime_start to replace the local variable "start_time" in migration_thread to avoid passing things around. Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 12 ++++-------- migration/migration.h | 2 ++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index e26f80d9aa..fff2823ad5 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2094,16 +2094,14 @@ static int migration_maybe_pause(MigrationState *s, * * @s: Current migration state * @current_active_state: The migration state we expect to be in - * @*start_time: Pointer to time to update */ -static void migration_completion(MigrationState *s, int current_active_state, - int64_t *start_time) +static void migration_completion(MigrationState *s, int current_active_state) { int ret; if (s->state == MIGRATION_STATUS_ACTIVE) { qemu_mutex_lock_iothread(); - *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); s->vm_was_running = runstate_is_running(); ret = global_state_store(); @@ -2209,7 +2207,6 @@ static void *migration_thread(void *opaque) * measured bandwidth */ int64_t threshold_size = 0; - int64_t start_time = initial_time; int64_t end_time; bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ @@ -2280,8 +2277,7 @@ static void *migration_thread(void *opaque) qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy); } else { trace_migration_thread_low_pending(pending_size); - migration_completion(s, current_active_state, - &start_time); + migration_completion(s, current_active_state); break; } } @@ -2332,7 +2328,7 @@ static void *migration_thread(void *opaque) uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); s->total_time = end_time - s->start_time; if (!entered_postcopy) { - s->downtime = end_time - start_time; + s->downtime = end_time - s->downtime_start; } if (s->total_time) { s->mbps = (((double) transferred_bytes * 8.0) / diff --git a/migration/migration.h b/migration/migration.h index 4cae263992..0aad12f9a1 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -119,6 +119,8 @@ struct MigrationState int64_t start_time; /* Total time used by latest migration (ms) */ int64_t total_time; + /* Timestamp when VM is down (ms) to migrate the last stuff */ + int64_t downtime_start; int64_t downtime; int64_t expected_downtime; bool enabled_capabilities[MIGRATION_CAPABILITY__MAX]; From cf011f082d7a2a9abfc0a257cc4de1cfcfbef24d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:11 +0800 Subject: [PATCH 21/27] migration: introduce migrate_calculate_complete Generalize the calculation part when migration complete into a function to simplify migration_thread(). 
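To make the units in that calculation explicit, here is a small self-contained example of the same arithmetic with made-up numbers; total_time is in milliseconds, so bits per millisecond equals kbit/s, and one more division by 1000 gives Mbit/s:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    /* Made-up sample values, only to show the units of the calculation. */
    uint64_t bytes = 512ULL * 1024 * 1024;        /* data sent: 512 MiB     */
    int64_t start_time = 1000;                    /* ms at migration start  */
    int64_t downtime_start = 20500;               /* ms when VM was stopped */
    int64_t end_time = 21000;                     /* ms at completion       */

    int64_t total_time = end_time - start_time;   /* 20000 ms               */
    int64_t downtime = end_time - downtime_start; /* 500 ms (precopy case)  */
    /* bits / ms = kbit/s, so divide once more by 1000 to get Mbit/s */
    double mbps = ((double)bytes * 8.0) / total_time / 1000;

    /* 512 MiB over 20 s works out to roughly 214.7 Mbit/s */
    printf("total_time=%" PRId64 " ms downtime=%" PRId64 " ms mbps=%.1f\n",
           total_time, downtime, mbps);
    return 0;
}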
Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index fff2823ad5..c1de46a101 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2190,6 +2190,25 @@ bool migrate_colo_enabled(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO]; } +static void migration_calculate_complete(MigrationState *s) +{ + uint64_t bytes = qemu_ftell(s->to_dst_file); + int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + s->total_time = end_time - s->start_time; + if (!s->downtime) { + /* + * It's still not set, so we are precopy migration. For + * postcopy, downtime is calculated during postcopy_start(). + */ + s->downtime = end_time - s->downtime_start; + } + + if (s->total_time) { + s->mbps = ((double) bytes * 8.0) / s->total_time / 1000; + } +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. @@ -2207,7 +2226,6 @@ static void *migration_thread(void *opaque) * measured bandwidth */ int64_t threshold_size = 0; - int64_t end_time; bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; @@ -2321,19 +2339,10 @@ static void *migration_thread(void *opaque) trace_migration_thread_after_loop(); /* If we enabled cpu throttling for auto-converge, turn it off. */ cpu_throttle_stop(); - end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_mutex_lock_iothread(); if (s->state == MIGRATION_STATUS_COMPLETED) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); - s->total_time = end_time - s->start_time; - if (!entered_postcopy) { - s->downtime = end_time - s->downtime_start; - } - if (s->total_time) { - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) s->total_time)) / 1000; - } + migration_calculate_complete(s); runstate_set(RUN_STATE_POSTMIGRATE); } else { if (s->state == MIGRATION_STATUS_ACTIVE) { From 39b9e17905c5ff8302c0ff17af546cbf68e0b4f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:12 +0800 Subject: [PATCH 22/27] migration: use switch at the end of migration It converts the old if clauses into switch, explicitly mentions the possible migration states. The old nested "if"s are not clear on what we do on different states. Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 44 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index c1de46a101..6023218ffa 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2341,26 +2341,30 @@ static void *migration_thread(void *opaque) cpu_throttle_stop(); qemu_mutex_lock_iothread(); - if (s->state == MIGRATION_STATUS_COMPLETED) { + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: migration_calculate_complete(s); runstate_set(RUN_STATE_POSTMIGRATE); - } else { - if (s->state == MIGRATION_STATUS_ACTIVE) { - /* - * We should really assert here, but since it's during - * migration, let's try to reduce the usage of assertions. 
- */ - if (!migrate_colo_enabled()) { - error_report("%s: critical error: calling COLO code without " - "COLO enabled", __func__); - } - migrate_start_colo_process(s); - /* - * Fixme: we will run VM in COLO no matter its old running state. - * After exited COLO, we will keep running. - */ - s->vm_was_running = true; + break; + + case MIGRATION_STATUS_ACTIVE: + /* + * We should really assert here, but since it's during + * migration, let's try to reduce the usage of assertions. + */ + if (!migrate_colo_enabled()) { + error_report("%s: critical error: calling COLO code without " + "COLO enabled", __func__); } + migrate_start_colo_process(s); + /* + * Fixme: we will run VM in COLO no matter its old running state. + * After exited COLO, we will keep running. + */ + s->vm_was_running = true; + /* Fallthrough */ + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: if (s->vm_was_running) { vm_start(); } else { @@ -2368,6 +2372,12 @@ static void *migration_thread(void *opaque) runstate_set(RUN_STATE_POSTMIGRATE); } } + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; } qemu_bh_schedule(s->cleanup_bh); qemu_mutex_unlock_iothread(); From b15df1ae5063c7c181f8f068f9eba7661b3b5e1c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:13 +0800 Subject: [PATCH 23/27] migration: cleanup stats update into function We have quite a few lines in migration_thread() that calculates some statistics for the migration interations. Isolate it into a single function to improve readability. Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 84 +++++++++++++++++++++++++------------------ migration/migration.h | 11 ++++++ 2 files changed, 60 insertions(+), 35 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 6023218ffa..d7f85aa460 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1303,6 +1303,8 @@ MigrationState *migrate_init(void) s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); s->total_time = 0; s->vm_was_running = false; + s->iteration_initial_bytes = 0; + s->threshold_size = 0; return s; } @@ -2209,6 +2211,43 @@ static void migration_calculate_complete(MigrationState *s) } } +static void migration_update_counters(MigrationState *s, + int64_t current_time) +{ + uint64_t transferred, time_spent; + int64_t threshold_size; + double bandwidth; + + if (current_time < s->iteration_start_time + BUFFER_DELAY) { + return; + } + + transferred = qemu_ftell(s->to_dst_file) - s->iteration_initial_bytes; + time_spent = current_time - s->iteration_start_time; + bandwidth = (double)transferred / time_spent; + threshold_size = bandwidth * s->parameters.downtime_limit; + + s->mbps = (((double) transferred * 8.0) / + ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; + + /* + * if we haven't sent anything, we don't want to + * recalculate. 10000 is a small enough number for our purposes + */ + if (ram_counters.dirty_pages_rate && transferred > 10000) { + s->expected_downtime = ram_counters.dirty_pages_rate * + qemu_target_page_size() / bandwidth; + } + + qemu_file_reset_rate_limit(s->to_dst_file); + + s->iteration_start_time = current_time; + s->iteration_initial_bytes = qemu_ftell(s->to_dst_file); + + trace_migrate_transferred(transferred, time_spent, + bandwidth, threshold_size); +} + /* * Master migration thread on the source VM. 
* It drives the migration and pumps the data down the outgoing channel. @@ -2216,22 +2255,15 @@ static void migration_calculate_complete(MigrationState *s) static void *migration_thread(void *opaque) { MigrationState *s = opaque; - /* Used by the bandwidth calcs, updated later */ - int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); - int64_t initial_bytes = 0; - /* - * The final stage happens when the remaining data is smaller than - * this threshold; it's calculated from the requested downtime and - * measured bandwidth - */ - int64_t threshold_size = 0; bool entered_postcopy = false; /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; rcu_register_thread(); + s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_savevm_state_header(s->to_dst_file); /* @@ -2271,17 +2303,17 @@ static void *migration_thread(void *opaque) if (!qemu_file_rate_limit(s->to_dst_file)) { uint64_t pend_post, pend_nonpost; - qemu_savevm_state_pending(s->to_dst_file, threshold_size, + qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_nonpost, &pend_post); pending_size = pend_nonpost + pend_post; - trace_migrate_pending(pending_size, threshold_size, + trace_migrate_pending(pending_size, s->threshold_size, pend_post, pend_nonpost); - if (pending_size && pending_size >= threshold_size) { + if (pending_size && pending_size >= s->threshold_size) { /* Still a significant amount to transfer */ if (migrate_postcopy() && s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && - pend_nonpost <= threshold_size && + pend_nonpost <= s->threshold_size && atomic_read(&s->start_postcopy)) { if (!postcopy_start(s)) { @@ -2306,33 +2338,15 @@ static void *migration_thread(void *opaque) trace_migration_thread_file_err(); break; } + current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - if (current_time >= initial_time + BUFFER_DELAY) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - - initial_bytes; - uint64_t time_spent = current_time - initial_time; - double bandwidth = (double)transferred_bytes / time_spent; - threshold_size = bandwidth * s->parameters.downtime_limit; - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; + migration_update_counters(s, current_time); - trace_migrate_transferred(transferred_bytes, time_spent, - bandwidth, threshold_size); - /* if we haven't sent anything, we don't want to recalculate - 10000 is a small enough number for our purposes */ - if (ram_counters.dirty_pages_rate && transferred_bytes > 10000) { - s->expected_downtime = ram_counters.dirty_pages_rate * - qemu_target_page_size() / bandwidth; - } - - qemu_file_reset_rate_limit(s->to_dst_file); - initial_time = current_time; - initial_bytes = qemu_ftell(s->to_dst_file); - } if (qemu_file_rate_limit(s->to_dst_file)) { /* usleep expects microseconds */ - g_usleep((initial_time + BUFFER_DELAY - current_time)*1000); + g_usleep((s->iteration_start_time + BUFFER_DELAY - + current_time) * 1000); } } diff --git a/migration/migration.h b/migration/migration.h index 0aad12f9a1..f2bc1aaf85 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -102,6 +102,17 @@ struct MigrationState QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + /* bytes already send at the beggining of current interation */ + uint64_t iteration_initial_bytes; + /* time at the start of current iteration */ + int64_t iteration_start_time; + 
/* + * The final stage happens when the remaining data is smaller than + * this threshold; it's calculated from the requested downtime and + * measured bandwidth + */ + int64_t threshold_size; + /* params from 'migrate-set-parameters' */ MigrationParameters parameters; From 2ad873057e0d7f227ed8a18f74db6f0433b32e74 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:14 +0800 Subject: [PATCH 24/27] migration: major cleanup for migrate iterations The major work for migration iterations are to move RAM/block/... data via qemu_savevm_state_iterate(). Generalize those part into a single function. Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 90 ++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index d7f85aa460..a95f5e469c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2095,11 +2095,11 @@ static int migration_maybe_pause(MigrationState *s, * The caller 'breaks' the loop when this returns. * * @s: Current migration state - * @current_active_state: The migration state we expect to be in */ -static void migration_completion(MigrationState *s, int current_active_state) +static void migration_completion(MigrationState *s) { int ret; + int current_active_state = s->state; if (s->state == MIGRATION_STATUS_ACTIVE) { qemu_mutex_lock_iothread(); @@ -2248,6 +2248,51 @@ static void migration_update_counters(MigrationState *s, bandwidth, threshold_size); } +/* Migration thread iteration status */ +typedef enum { + MIG_ITERATE_RESUME, /* Resume current iteration */ + MIG_ITERATE_SKIP, /* Skip current iteration */ + MIG_ITERATE_BREAK, /* Break the loop */ +} MigIterateState; + +/* + * Return true if continue to the next iteration directly, false + * otherwise. + */ +static MigIterateState migration_iteration_run(MigrationState *s) +{ + uint64_t pending_size, pend_post, pend_nonpost; + bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; + + qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, + &pend_nonpost, &pend_post); + pending_size = pend_nonpost + pend_post; + + trace_migrate_pending(pending_size, s->threshold_size, + pend_post, pend_nonpost); + + if (pending_size && pending_size >= s->threshold_size) { + /* Still a significant amount to transfer */ + if (migrate_postcopy() && !in_postcopy && + pend_nonpost <= s->threshold_size && + atomic_read(&s->start_postcopy)) { + if (postcopy_start(s)) { + error_report("%s: postcopy failed to start", __func__); + } + return MIG_ITERATE_SKIP; + } + /* Just another iteration step */ + qemu_savevm_state_iterate(s->to_dst_file, + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); + } else { + trace_migration_thread_low_pending(pending_size); + migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. 
@@ -2256,9 +2301,6 @@ static void *migration_thread(void *opaque) { MigrationState *s = opaque; int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); - bool entered_postcopy = false; - /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ - enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; rcu_register_thread(); @@ -2298,43 +2340,21 @@ static void *migration_thread(void *opaque) while (s->state == MIGRATION_STATUS_ACTIVE || s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { int64_t current_time; - uint64_t pending_size; if (!qemu_file_rate_limit(s->to_dst_file)) { - uint64_t pend_post, pend_nonpost; - - qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, - &pend_nonpost, &pend_post); - pending_size = pend_nonpost + pend_post; - trace_migrate_pending(pending_size, s->threshold_size, - pend_post, pend_nonpost); - if (pending_size && pending_size >= s->threshold_size) { - /* Still a significant amount to transfer */ - - if (migrate_postcopy() && - s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && - pend_nonpost <= s->threshold_size && - atomic_read(&s->start_postcopy)) { - - if (!postcopy_start(s)) { - current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; - entered_postcopy = true; - } - - continue; - } - /* Just another iteration step */ - qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy); - } else { - trace_migration_thread_low_pending(pending_size); - migration_completion(s, current_active_state); + MigIterateState iter_state = migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { break; } } if (qemu_file_get_error(s->to_dst_file)) { - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); + if (migration_is_setup_or_active(s->state)) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } trace_migration_thread_file_err(); break; } From 199aa6d4e445b863cc0e373496b9d0900437d8a4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:15 +0800 Subject: [PATCH 25/27] migration: put the finish part into a new function This patch only moved the last part of migration_thread() into a new function migration_iteration_finish() to make it much shorter. With previous works to remove some local variables, now it's fairly easy to do that. Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Signed-off-by: Juan Quintela --- migration/migration.c | 94 ++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index a95f5e469c..793fd163f7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2293,6 +2293,54 @@ static MigIterateState migration_iteration_run(MigrationState *s) return MIG_ITERATE_RESUME; } +static void migration_iteration_finish(MigrationState *s) +{ + /* If we enabled cpu throttling for auto-converge, turn it off. */ + cpu_throttle_stop(); + + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + runstate_set(RUN_STATE_POSTMIGRATE); + break; + + case MIGRATION_STATUS_ACTIVE: + /* + * We should really assert here, but since it's during + * migration, let's try to reduce the usage of assertions. + */ + if (!migrate_colo_enabled()) { + error_report("%s: critical error: calling COLO code without " + "COLO enabled", __func__); + } + migrate_start_colo_process(s); + /* + * Fixme: we will run VM in COLO no matter its old running state. 
+ * After exited COLO, we will keep running. + */ + s->vm_was_running = true; + /* Fallthrough */ + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + if (s->vm_was_running) { + vm_start(); + } else { + if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { + runstate_set(RUN_STATE_POSTMIGRATE); + } + } + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + qemu_bh_schedule(s->cleanup_bh); + qemu_mutex_unlock_iothread(); +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. @@ -2371,51 +2419,7 @@ static void *migration_thread(void *opaque) } trace_migration_thread_after_loop(); - /* If we enabled cpu throttling for auto-converge, turn it off. */ - cpu_throttle_stop(); - - qemu_mutex_lock_iothread(); - switch (s->state) { - case MIGRATION_STATUS_COMPLETED: - migration_calculate_complete(s); - runstate_set(RUN_STATE_POSTMIGRATE); - break; - - case MIGRATION_STATUS_ACTIVE: - /* - * We should really assert here, but since it's during - * migration, let's try to reduce the usage of assertions. - */ - if (!migrate_colo_enabled()) { - error_report("%s: critical error: calling COLO code without " - "COLO enabled", __func__); - } - migrate_start_colo_process(s); - /* - * Fixme: we will run VM in COLO no matter its old running state. - * After exited COLO, we will keep running. - */ - s->vm_was_running = true; - /* Fallthrough */ - case MIGRATION_STATUS_FAILED: - case MIGRATION_STATUS_CANCELLED: - if (s->vm_was_running) { - vm_start(); - } else { - if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { - runstate_set(RUN_STATE_POSTMIGRATE); - } - } - break; - - default: - /* Should not reach here, but if so, forgive the VM. */ - error_report("%s: Unknown ending state %d", __func__, s->state); - break; - } - qemu_bh_schedule(s->cleanup_bh); - qemu_mutex_unlock_iothread(); - + migration_iteration_finish(s); rcu_unregister_thread(); return NULL; } From 26978faf2faa4e7688622694ca787a8abc691147 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:16 +0800 Subject: [PATCH 26/27] migration: remove some block_cleanup_parameters() Keep the one in migrate_fd_cleanup() would be enough. Removing the other two. Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 793fd163f7..47367cffae 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1160,7 +1160,6 @@ void migrate_fd_error(MigrationState *s, const Error *error) MIGRATION_STATUS_FAILED); migrate_set_error(s, error); notifier_list_notify(&migration_state_notifiers, s); - block_cleanup_parameters(s); } static void migrate_fd_cancel(MigrationState *s) @@ -1206,7 +1205,6 @@ static void migrate_fd_cancel(MigrationState *s) s->block_inactive = false; } } - block_cleanup_parameters(s); } void add_migration_state_change_notifier(Notifier *notify) From 816306826a45f4d15352e32d157172af3a35899f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 3 Jan 2018 20:20:17 +0800 Subject: [PATCH 27/27] migration: remove notify in fd_error It is already called in migrate_fd_cleanup. 
Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 47367cffae..d3a1c494c0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1159,7 +1159,6 @@ void migrate_fd_error(MigrationState *s, const Error *error) migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); migrate_set_error(s, error); - notifier_list_notify(&migration_state_notifiers, s); } static void migrate_fd_cancel(MigrationState *s)