From 728470bea15b11ba7b3e3db54f0d9939908e0e65 Mon Sep 17 00:00:00 2001 From: Gonglei Date: Tue, 23 Jun 2015 15:56:38 +0800 Subject: [PATCH 01/28] rdma: fix memory leak Variable "r" going out of scope leaks the storage it points to in line 3268. Signed-off-by: Gonglei Reviewed-by: Amit Shah Signed-off-by: Juan Quintela --- migration/rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration/rdma.c b/migration/rdma.c index b777273b59..0a00290160 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -3263,12 +3263,13 @@ static const QEMUFileOps rdma_write_ops = { static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) { - QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA)); + QEMUFileRDMA *r; if (qemu_file_mode_is_not_valid(mode)) { return NULL; } + r = g_malloc0(sizeof(QEMUFileRDMA)); r->rdma = rdma; if (mode[0] == 'w') { From 1aca9a5f7d5a1ef9ee0233eac0fccc77ea6f0626 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 23 Jun 2015 17:34:35 +0100 Subject: [PATCH 02/28] Only try and read a VMDescription if it should be there The VMDescription section maybe after the EOF mark, the current code does a 'qemu_get_byte' and either gets the header byte identifying the description or an error (which it ignores). Doing the 'get' upsets RDMA which hangs on old machine types without the VMDescription. Just avoid reading the VMDescription if we wouldn't send it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/savevm.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index 9e0e286797..1a9b00b320 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1127,16 +1127,35 @@ int qemu_loadvm_state(QEMUFile *f) * Try to read in the VMDESC section as well, so that dumping tools that * intercept our migration stream have the chance to see it. */ - if (qemu_get_byte(f) == QEMU_VM_VMDESCRIPTION) { - uint32_t size = qemu_get_be32(f); - uint8_t *buf = g_malloc(0x1000); - while (size > 0) { - uint32_t read_chunk = MIN(size, 0x1000); - qemu_get_buffer(f, buf, read_chunk); - size -= read_chunk; + /* We've got to be careful; if we don't read the data and just shut the fd + * then the sender can error if we close while it's still sending. + * We also mustn't read data that isn't there; some transports (RDMA) + * will stall waiting for that data when the source has already closed. + */ + if (should_send_vmdesc()) { + uint8_t *buf; + uint32_t size; + section_type = qemu_get_byte(f); + + if (section_type != QEMU_VM_VMDESCRIPTION) { + error_report("Expected vmdescription section, but got %d", + section_type); + /* + * It doesn't seem worth failing at this point since + * we apparently have an otherwise valid VM state + */ + } else { + buf = g_malloc(0x1000); + size = qemu_get_be32(f); + + while (size > 0) { + uint32_t read_chunk = MIN(size, 0x1000); + qemu_get_buffer(f, buf, read_chunk); + size -= read_chunk; + } + g_free(buf); } - g_free(buf); } cpu_synchronize_all_post_init(); From 24ec68ef84fdacd5dddb83f3b341165c4815e6d6 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:20 +0100 Subject: [PATCH 03/28] rdma typos A couple of typo fixes. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 6 +++--- trace-events | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index 0a00290160..fc6b81c447 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -1223,7 +1223,7 @@ const char *print_wrid(int wrid) /* * Perform a non-optimized memory unregistration after every transfer - * for demonsration purposes, only if pin-all is not requested. + * for demonstration purposes, only if pin-all is not requested. * * Potential optimizations: * 1. Start a new thread to run this function continuously @@ -3288,7 +3288,7 @@ static void rdma_accept_incoming_migration(void *opaque) QEMUFile *f; Error *local_err = NULL, **errp = &local_err; - trace_qemu_dma_accept_incoming_migration(); + trace_qemu_rdma_accept_incoming_migration(); ret = qemu_rdma_accept(rdma); if (ret) { @@ -3296,7 +3296,7 @@ static void rdma_accept_incoming_migration(void *opaque) return; } - trace_qemu_dma_accept_incoming_migration_accepted(); + trace_qemu_rdma_accept_incoming_migration_accepted(); f = qemu_fopen_rdma(rdma, "rb"); if (f == NULL) { diff --git a/trace-events b/trace-events index d24d80aed2..0917df287f 100644 --- a/trace-events +++ b/trace-events @@ -1405,8 +1405,8 @@ migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PR migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 # migration/rdma.c -qemu_dma_accept_incoming_migration(void) "" -qemu_dma_accept_incoming_migration_accepted(void) "" +qemu_rdma_accept_incoming_migration(void) "" +qemu_rdma_accept_incoming_migration_accepted(void) "" qemu_rdma_accept_pin_state(bool pin) "%d" qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p" qemu_rdma_block_for_wrid_miss(const char *wcompstr, int wcomp, const char *gcompstr, uint64_t req) "A Wanted wrid %s (%d) but got %s (%" PRIu64 ")" From 4fb5364b9096d6110c46604dbf1e19b7e766e757 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:21 +0100 Subject: [PATCH 04/28] Store block name in local blocks structure In a later patch the block name will be used to match up two views of the block list. Keep a copy of the block name with the local block list. (At some point it could be argued that it would be best just to let migration see the innards of RAMBlock and avoid the need to use foreach). Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Michael R. Hines Signed-off-by: Juan Quintela --- migration/rdma.c | 35 +++++++++++++++++++++-------------- trace-events | 2 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index fc6b81c447..6fa96019ce 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -215,17 +215,18 @@ static void network_to_caps(RDMACapabilities *cap) * the information. It's small anyway, so a list is overkill. */ typedef struct RDMALocalBlock { - uint8_t *local_host_addr; /* local virtual address */ - uint64_t remote_host_addr; /* remote virtual address */ - uint64_t offset; - uint64_t length; - struct ibv_mr **pmr; /* MRs for chunk-level registration */ - struct ibv_mr *mr; /* MR for non-chunk-level registration */ - uint32_t *remote_keys; /* rkeys for chunk-level registration */ - uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ - int index; /* which block are we */ - bool is_ram_block; - int nb_chunks; + char *block_name; + uint8_t *local_host_addr; /* local virtual address */ + uint64_t remote_host_addr; /* remote virtual address */ + uint64_t offset; + uint64_t length; + struct ibv_mr **pmr; /* MRs for chunk-level registration */ + struct ibv_mr *mr; /* MR for non-chunk-level registration */ + uint32_t *remote_keys; /* rkeys for chunk-level registration */ + uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ + int index; /* which block are we */ + bool is_ram_block; + int nb_chunks; unsigned long *transit_bitmap; unsigned long *unregister_bitmap; } RDMALocalBlock; @@ -511,7 +512,8 @@ static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, return result; } -static int rdma_add_block(RDMAContext *rdma, void *host_addr, +static int rdma_add_block(RDMAContext *rdma, const char *block_name, + void *host_addr, ram_addr_t block_offset, uint64_t length) { RDMALocalBlocks *local = &rdma->local_ram_blocks; @@ -539,6 +541,7 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, block = &local->block[local->nb_blocks]; + block->block_name = g_strdup(block_name); block->local_host_addr = host_addr; block->offset = block_offset; block->length = length; @@ -554,7 +557,8 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); - trace_rdma_add_block(local->nb_blocks, (uintptr_t) block->local_host_addr, + trace_rdma_add_block(block_name, local->nb_blocks, + (uintptr_t) block->local_host_addr, block->offset, block->length, (uintptr_t) (block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -574,7 +578,7 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, static int qemu_rdma_init_one_block(const char *block_name, void *host_addr, ram_addr_t block_offset, ram_addr_t length, void *opaque) { - return rdma_add_block(opaque, host_addr, block_offset, length); + return rdma_add_block(opaque, block_name, host_addr, block_offset, length); } /* @@ -636,6 +640,9 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) g_free(block->remote_keys); block->remote_keys = NULL; + g_free(block->block_name); + block->block_name = NULL; + for (x = 0; x < local->nb_blocks; x++) { g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); } diff --git a/trace-events b/trace-events index 0917df287f..b29d8af8fc 100644 --- a/trace-events +++ b/trace-events @@ -1458,7 +1458,7 @@ qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Receive qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)" qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 -rdma_add_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" rdma_delete_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" rdma_start_incoming_migration(void) "" rdma_start_incoming_migration_after_dest_init(void) "" From b12f7777981953b7d939496283014740bdd6de64 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:22 +0100 Subject: [PATCH 05/28] Translate offsets to destination address space The 'offset' field in RDMACompress and 'current_addr' field in RDMARegister are commented as being offsets within a particular RAMBlock, however they appear to actually be offsets within the ram_addr_t space. The code currently assumes that the offsets on the source/destination match, this change removes the need for the assumption for these structures by translating the addresses into the ram_addr_t space of the destination host. Note: An alternative would be to change the fields to actually take the data they're commented for; this would potentially be simpler but would break stream compatibility for those cases that currently work. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index 6fa96019ce..d489012758 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -412,7 +412,7 @@ static void network_to_control(RDMAControlHeader *control) */ typedef struct QEMU_PACKED { union QEMU_PACKED { - uint64_t current_addr; /* offset into the ramblock of the chunk */ + uint64_t current_addr; /* offset into the ram_addr_t space */ uint64_t chunk; /* chunk to lookup if unregistering */ } key; uint32_t current_index; /* which ramblock the chunk belongs to */ @@ -420,8 +420,19 @@ typedef struct QEMU_PACKED { uint64_t chunks; /* how many sequential chunks to register */ } RDMARegister; -static void register_to_network(RDMARegister *reg) +static void register_to_network(RDMAContext *rdma, RDMARegister *reg) { + RDMALocalBlock *local_block; + local_block = &rdma->local_ram_blocks.block[reg->current_index]; + + if (local_block->is_ram_block) { + /* + * current_addr as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + reg->key.current_addr -= local_block->offset; + reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset; + } reg->key.current_addr = htonll(reg->key.current_addr); reg->current_index = htonl(reg->current_index); reg->chunks = htonll(reg->chunks); @@ -437,13 +448,19 @@ static void network_to_register(RDMARegister *reg) typedef struct QEMU_PACKED { uint32_t value; /* if zero, we will madvise() */ uint32_t block_idx; /* which ram block index */ - uint64_t offset; /* where in the remote ramblock this chunk */ + uint64_t offset; /* Address in remote ram_addr_t space */ uint64_t length; /* length of the chunk */ } RDMACompress; -static void compress_to_network(RDMACompress *comp) +static void compress_to_network(RDMAContext *rdma, RDMACompress *comp) { comp->value = htonl(comp->value); + /* + * comp->offset as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset; + comp->offset += rdma->dest_blocks[comp->block_idx].offset; comp->block_idx = htonl(comp->block_idx); comp->offset = htonll(comp->offset); comp->length = htonll(comp->length); @@ -1296,7 +1313,7 @@ static int qemu_rdma_unregister_waiting(RDMAContext *rdma) rdma->total_registrations--; reg.key.chunk = chunk; - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, NULL, NULL); if (ret < 0) { @@ -1917,7 +1934,7 @@ retry: trace_qemu_rdma_write_one_zero(chunk, sge.length, current_index, current_addr); - compress_to_network(&comp); + compress_to_network(rdma, &comp); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &comp, NULL, NULL, NULL); @@ -1944,7 +1961,7 @@ retry: trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index, current_addr); - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, ®_result_idx, NULL); if (ret < 0) { From 632e3a5cd812d6bbd38fd2f3ffc189ff5ea51926 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:23 +0100 Subject: [PATCH 06/28] Rework ram_control_load_hook to hook during block load We need the names of RAMBlocks as they're loaded for RDMA, reuse a slightly modified ram_control_load_hook: a) Pass a 'data' parameter to use for the name in the block-reg case b) Only some hook types now require the presence of a hook function. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- include/migration/migration.h | 2 +- include/migration/qemu-file.h | 14 +++++++++----- migration/qemu-file.c | 16 +++++++++++----- migration/ram.c | 4 +++- migration/rdma.c | 28 ++++++++++++++++++++++------ trace-events | 2 +- 6 files changed, 47 insertions(+), 19 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 9387c8c9d4..afba2335e3 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -179,7 +179,7 @@ int migrate_decompress_threads(void); void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); -void ram_control_load_hook(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data); /* Whenever this is found in the data stream, the flags * will be passed to ram_control_load_hook in the incoming-migration diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 4f67d79227..ea49f33fac 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -63,16 +63,20 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, /* * This function provides hooks around different * stages of RAM migration. + * 'opaque' is the backend specific data in QEMUFile + * 'data' is call specific data associated with the 'flags' value */ -typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); +typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags, + void *data); /* * Constants used by ram_control_* hooks */ -#define RAM_CONTROL_SETUP 0 -#define RAM_CONTROL_ROUND 1 -#define RAM_CONTROL_HOOK 2 -#define RAM_CONTROL_FINISH 3 +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_HOOK 2 +#define RAM_CONTROL_FINISH 3 +#define RAM_CONTROL_BLOCK_REG 4 /* * This function allows override of where the RAM page diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 557c1c1a62..6bb3dc15cd 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -129,7 +129,7 @@ void ram_control_before_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->before_ram_iterate) { - ret = f->ops->before_ram_iterate(f, f->opaque, flags); + ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -141,24 +141,30 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->after_ram_iterate) { - ret = f->ops->after_ram_iterate(f, f->opaque, flags); + ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } } } -void ram_control_load_hook(QEMUFile *f, uint64_t flags) +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) { int ret = -EINVAL; if (f->ops->hook_ram_load) { - ret = f->ops->hook_ram_load(f, f->opaque, flags); + ret = f->ops->hook_ram_load(f, f->opaque, flags, data); if (ret < 0) { qemu_file_set_error(f, ret); } } else { - qemu_file_set_error(f, ret); + /* + * Hook is a hook specifically requested by the source sending a flag + * that expects there to be a hook on the destination. + */ + if (flags == RAM_CONTROL_HOOK) { + qemu_file_set_error(f, ret); + } } } diff --git a/migration/ram.c b/migration/ram.c index 57368e1575..644f52ad85 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1477,6 +1477,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) error_report_err(local_err); } } + ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, + block->idstr); break; } } @@ -1545,7 +1547,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; default: if (flags & RAM_SAVE_FLAG_HOOK) { - ram_control_load_hook(f, flags); + ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); } else { error_report("Unknown combination of migration flags: %#x", flags); diff --git a/migration/rdma.c b/migration/rdma.c index d489012758..fab736e9ca 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -2913,8 +2913,7 @@ err_rdma_dest_wait: * * Keep doing this until the source tells us to stop. */ -static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, - uint64_t flags) +static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, @@ -2944,7 +2943,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, CHECK_ERROR_STATE(); do { - trace_qemu_rdma_registration_handle_wait(flags); + trace_qemu_rdma_registration_handle_wait(); ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE); @@ -3132,8 +3131,25 @@ out: return ret; } +static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) +{ + switch (flags) { + case RAM_CONTROL_BLOCK_REG: + /* TODO A later patch */ + return 0; + break; + + case RAM_CONTROL_HOOK: + return qemu_rdma_registration_handle(f, opaque); + + default: + /* Shouldn't be called with any other values */ + abort(); + } +} + static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { QEMUFileRDMA *rfile = opaque; RDMAContext *rdma = rfile->rdma; @@ -3152,7 +3168,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, * First, flush writes, if any. */ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { Error *local_err = NULL, **errp = &local_err; QEMUFileRDMA *rfile = opaque; @@ -3274,7 +3290,7 @@ static const QEMUFileOps rdma_read_ops = { .get_buffer = qemu_rdma_get_buffer, .get_fd = qemu_rdma_get_fd, .close = qemu_rdma_close, - .hook_ram_load = qemu_rdma_registration_handle, + .hook_ram_load = rdma_load_hook, }; static const QEMUFileOps rdma_write_ops = { diff --git a/trace-events b/trace-events index b29d8af8fc..f1b94d66e3 100644 --- a/trace-events +++ b/trace-events @@ -1439,7 +1439,7 @@ qemu_rdma_registration_handle_register_rkey(int rkey) "%x" qemu_rdma_registration_handle_unregister(int requests) "%d requests" qemu_rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64 qemu_rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64 -qemu_rdma_registration_handle_wait(uint64_t flags) "Waiting for next request %" PRIu64 +qemu_rdma_registration_handle_wait(void) "" qemu_rdma_registration_start(uint64_t flags) "%" PRIu64 qemu_rdma_registration_stop(uint64_t flags) "%" PRIu64 qemu_rdma_registration_stop_ram(void) "" From 03fcab38617ac9bcd6ed28cb1b6a0ecd8fb3bc82 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:24 +0100 Subject: [PATCH 07/28] Allow rdma_delete_block to work without the hash In the next patch we remove the hash on the destination, rdma_delete_block does two things with the hash which can be avoided: a) The caller passes the offset and rdma_delete_block looks it up in the hash; fixed by getting the caller to pass the block b) The hash gets recreated after deletion; fixed by making that conditional on the hash being initialised. While this function is currently only used during cleanup, Michael asked that we keep it general for future dynamic block registration work. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 27 ++++++++++++++++----------- trace-events | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index fab736e9ca..347d380912 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -618,16 +618,19 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) return 0; } -static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) +/* + * Note: If used outside of cleanup, the caller must ensure that the destination + * block structures are also updated + */ +static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *) block_offset); RDMALocalBlock *old = local->block; int x; - assert(block); - + if (rdma->blockmap) { + g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); + } if (block->pmr) { int j; @@ -660,8 +663,11 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) g_free(block->block_name); block->block_name = NULL; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + } } if (local->nb_blocks > 1) { @@ -683,8 +689,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->block = NULL; } - trace_rdma_delete_block(local->nb_blocks, - (uintptr_t)block->local_host_addr, + trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr, block->offset, block->length, (uintptr_t)(block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -694,7 +699,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->nb_blocks--; - if (local->nb_blocks) { + if (local->nb_blocks && rdma->blockmap) { for (x = 0; x < local->nb_blocks; x++) { g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)local->block[x].offset, @@ -2222,7 +2227,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) if (rdma->local_ram_blocks.block) { while (rdma->local_ram_blocks.nb_blocks) { - rdma_delete_block(rdma, rdma->local_ram_blocks.block->offset); + rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); } } diff --git a/trace-events b/trace-events index f1b94d66e3..b1e572ae87 100644 --- a/trace-events +++ b/trace-events @@ -1459,7 +1459,7 @@ qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)" qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" -rdma_delete_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" rdma_start_incoming_migration(void) "" rdma_start_incoming_migration_after_dest_init(void) "" rdma_start_incoming_migration_after_rdma_listen(void) "" From 760ff4bebc86d08b252809e0da7261c986d022ff Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:25 +0100 Subject: [PATCH 08/28] Rework ram block hash RDMA uses a hash from block offset->RAM Block; this isn't needed on the destination, and it becomes harder to maintain after the next patch in the series that sorts the block list. Split the hash so that it's only generated on the source. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index 347d380912..a652e6705f 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -534,23 +534,22 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, ram_addr_t block_offset, uint64_t length) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *)(uintptr_t)block_offset); + RDMALocalBlock *block; RDMALocalBlock *old = local->block; - assert(block == NULL); - local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1)); if (local->nb_blocks) { int x; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, - (void *)(uintptr_t)old[x].offset); - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)old[x].offset, - &local->block[x]); + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)old[x].offset, + &local->block[x]); + } } memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); g_free(old); @@ -572,7 +571,9 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, block->is_ram_block = local->init ? false : true; - g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + if (rdma->blockmap) { + g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + } trace_rdma_add_block(block_name, local->nb_blocks, (uintptr_t) block->local_host_addr, @@ -608,7 +609,6 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) RDMALocalBlocks *local = &rdma->local_ram_blocks; assert(rdma->blockmap == NULL); - rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); memset(local, 0, sizeof *local); qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma); trace_qemu_rdma_init_ram_blocks(local->nb_blocks); @@ -2300,6 +2300,14 @@ static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all) goto err_rdma_source_init; } + /* Build the hash that maps from offset to RAMBlock */ + rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); + for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) { + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset, + &rdma->local_ram_blocks.block[idx]); + } + for (idx = 0; idx < RDMA_WRID_MAX; idx++) { ret = qemu_rdma_reg_control(rdma, idx); if (ret) { From e4d633207c129dc5b7d145240ac4a1997ef3902f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:26 +0100 Subject: [PATCH 09/28] Sort destination RAMBlocks to be the same as the source Use the order of incoming RAMBlocks from the source to record an index number; that then allows us to sort the destination local RAMBlock list to match the source. Now that the RAMBlocks are known to be in the same order, this simplifies the RDMA Registration step which previously tried to match RAMBlocks based on offset (which isn't guaranteed to match). Looking at the existing compress code, I think it was erroneously relying on an assumption of matching ordering, which this fixes. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 101 ++++++++++++++++++++++++++++++++++------------- trace-events | 2 + 2 files changed, 75 insertions(+), 28 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index a652e6705f..73844a31fd 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -225,6 +225,7 @@ typedef struct RDMALocalBlock { uint32_t *remote_keys; /* rkeys for chunk-level registration */ uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ int index; /* which block are we */ + unsigned int src_index; /* (Only used on dest) */ bool is_ram_block; int nb_chunks; unsigned long *transit_bitmap; @@ -354,6 +355,9 @@ typedef struct RDMAContext { RDMALocalBlocks local_ram_blocks; RDMADestBlock *dest_blocks; + /* Index of the next RAMBlock received during block registration */ + unsigned int next_src_index; + /* * Migration on *destination* started. * Then use coroutine yield function. @@ -562,6 +566,7 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, block->offset = block_offset; block->length = length; block->index = local->nb_blocks; + block->src_index = ~0U; /* Filled in by the receipt of the block list */ block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; block->transit_bitmap = bitmap_new(block->nb_chunks); bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); @@ -2917,6 +2922,14 @@ err_rdma_dest_wait: return ret; } +static int dest_ram_sort_func(const void *a, const void *b) +{ + unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; + unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; + + return (a_index < b_index) ? -1 : (a_index != b_index); +} + /* * During each iteration of the migration, we listen for instructions * by the source VM to perform dynamic page registrations before they @@ -2994,6 +3007,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) case RDMA_CONTROL_RAM_BLOCKS_REQUEST: trace_qemu_rdma_registration_handle_ram_blocks(); + /* Sort our local RAM Block list so it's the same as the source, + * we can do this since we've filled in a src_index in the list + * as we received the RAMBlock list earlier. + */ + qsort(rdma->local_ram_blocks.block, + rdma->local_ram_blocks.nb_blocks, + sizeof(RDMALocalBlock), dest_ram_sort_func); if (rdma->pin_all) { ret = qemu_rdma_reg_whole_ram_blocks(rdma); if (ret) { @@ -3021,6 +3041,12 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) rdma->dest_blocks[i].length = local->block[i].length; dest_block_to_network(&rdma->dest_blocks[i]); + trace_qemu_rdma_registration_handle_ram_blocks_loop( + local->block[i].block_name, + local->block[i].offset, + local->block[i].length, + local->block[i].local_host_addr, + local->block[i].src_index); } blocks.len = rdma->local_ram_blocks.nb_blocks @@ -3144,13 +3170,44 @@ out: return ret; } +/* Destination: + * Called via a ram_control_load_hook during the initial RAM load section which + * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks + * on the source. + * We've already built our local RAMBlock list, but not yet sent the list to + * the source. + */ +static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) +{ + RDMAContext *rdma = rfile->rdma; + int curr; + int found = -1; + + /* Find the matching RAMBlock in our local list */ + for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { + if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { + found = curr; + break; + } + } + + if (found == -1) { + error_report("RAMBlock '%s' not found on destination", name); + return -ENOENT; + } + + rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; + trace_rdma_block_notification_handle(name, rdma->next_src_index); + rdma->next_src_index++; + + return 0; +} + static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) { switch (flags) { case RAM_CONTROL_BLOCK_REG: - /* TODO A later patch */ - return 0; - break; + return rdma_block_notification_handle(opaque, data); case RAM_CONTROL_HOOK: return qemu_rdma_registration_handle(f, opaque); @@ -3201,7 +3258,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, if (flags == RAM_CONTROL_SETUP) { RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; RDMALocalBlocks *local = &rdma->local_ram_blocks; - int reg_result_idx, i, j, nb_dest_blocks; + int reg_result_idx, i, nb_dest_blocks; head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; trace_qemu_rdma_registration_stop_ram(); @@ -3237,9 +3294,10 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, */ if (local->nb_blocks != nb_dest_blocks) { - ERROR(errp, "ram blocks mismatch #1! " + ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) " "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + "not identical on both the source and destination.", + local->nb_blocks, nb_dest_blocks); return -EINVAL; } @@ -3249,30 +3307,17 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, for (i = 0; i < nb_dest_blocks; i++) { network_to_dest_block(&rdma->dest_blocks[i]); - /* search local ram blocks */ - for (j = 0; j < local->nb_blocks; j++) { - if (rdma->dest_blocks[i].offset != local->block[j].offset) { - continue; - } - - if (rdma->dest_blocks[i].length != local->block[j].length) { - ERROR(errp, "ram blocks mismatch #2! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); - return -EINVAL; - } - local->block[j].remote_host_addr = - rdma->dest_blocks[i].remote_host_addr; - local->block[j].remote_rkey = rdma->dest_blocks[i].remote_rkey; - break; - } - - if (j >= local->nb_blocks) { - ERROR(errp, "ram blocks mismatch #3! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + /* We require that the blocks are in the same order */ + if (rdma->dest_blocks[i].length != local->block[i].length) { + ERROR(errp, "Block %s/%d has a different length %" PRIu64 + "vs %" PRIu64, local->block[i].block_name, i, + local->block[i].length, + rdma->dest_blocks[i].length); return -EINVAL; } + local->block[i].remote_host_addr = + rdma->dest_blocks[i].remote_host_addr; + local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; } } diff --git a/trace-events b/trace-events index b1e572ae87..a38dd2e7ae 100644 --- a/trace-events +++ b/trace-events @@ -1433,6 +1433,7 @@ qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu6 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64 qemu_rdma_registration_handle_finished(void) "" qemu_rdma_registration_handle_ram_blocks(void) "" +qemu_rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u" qemu_rdma_registration_handle_register(int requests) "%d requests" qemu_rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64 qemu_rdma_registration_handle_register_rkey(int rkey) "%x" @@ -1459,6 +1460,7 @@ qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)" qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_block_notification_handle(const char *name, int index) "%s at %d" rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" rdma_start_incoming_migration(void) "" rdma_start_incoming_migration_after_dest_init(void) "" From afcddefdbe75d0c20bf6e11b5512ba768ce0700c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:27 +0100 Subject: [PATCH 10/28] Sanity check RDMA remote data Perform some basic (but probably not complete) sanity checking on requests from the RDMA source. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Michael R. Hines Signed-off-by: Juan Quintela --- migration/rdma.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/migration/rdma.c b/migration/rdma.c index 73844a31fd..73a79be645 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -2992,6 +2992,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) trace_qemu_rdma_registration_handle_compress(comp->length, comp->block_idx, comp->offset); + if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'compress' bad block index %u (vs %d)", + (unsigned int)comp->block_idx, + rdma->local_ram_blocks.nb_blocks); + ret = -EIO; + break; + } block = &(rdma->local_ram_blocks.block[comp->block_idx]); host_addr = block->local_host_addr + @@ -3080,8 +3087,23 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) trace_qemu_rdma_registration_handle_register_loop(count, reg->current_index, reg->key.current_addr, reg->chunks); + if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'register' bad block index %u (vs %d)", + (unsigned int)reg->current_index, + rdma->local_ram_blocks.nb_blocks); + ret = -ENOENT; + break; + } block = &(rdma->local_ram_blocks.block[reg->current_index]); if (block->is_ram_block) { + if (block->offset > reg->key.current_addr) { + error_report("rdma: bad register address for block %s" + " offset: %" PRIx64 " current_addr: %" PRIx64, + block->block_name, block->offset, + reg->key.current_addr); + ret = -ERANGE; + break; + } host_addr = (block->local_host_addr + (reg->key.current_addr - block->offset)); chunk = ram_chunk_index(block->local_host_addr, @@ -3090,6 +3112,14 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) chunk = reg->key.chunk; host_addr = block->local_host_addr + (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT)); + /* Check for particularly bad chunk value */ + if (host_addr < (void *)block->local_host_addr) { + error_report("rdma: bad chunk for block %s" + " chunk: %" PRIx64, + block->block_name, reg->key.chunk); + ret = -ERANGE; + break; + } } chunk_start = ram_chunk_start(block, chunk); chunk_end = ram_chunk_end(block, chunk + reg->chunks); From ef4b722d19cab845eaa0d1f912018b09a9d8288b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 11 Jun 2015 18:17:28 +0100 Subject: [PATCH 11/28] Fail more cleanly in mismatched RAM cases If the number of RAMBlocks was different on the source from the destination, QEMU would hang waiting for a disconnect on the source and wouldn't release from that hang until the destination was manually killed. Mark the stream as being in error, this causes the destination to die and the source to carry on. (It still gets a whole bunch of warnings on the destination, and I've not managed to complete another migration after the 1st one, still progress). Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Juan Quintela --- migration/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/rdma.c b/migration/rdma.c index 73a79be645..f106b2a818 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -3328,6 +3328,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, "Your QEMU command line parameters are probably " "not identical on both the source and destination.", local->nb_blocks, nb_dest_blocks); + rdma->error_state = -EINVAL; return -EINVAL; } @@ -3343,6 +3344,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, "vs %" PRIu64, local->block[i].block_name, i, local->block[i].length, rdma->dest_blocks[i].length); + rdma->error_state = -EINVAL; return -EINVAL; } local->block[i].remote_host_addr = From ff14e817f6c5f110b77e22185b256a17a96aa881 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 12 Jun 2015 18:37:52 +0100 Subject: [PATCH 12/28] Fix older machine type compatibility on power with section footers I forgot to add compatibility for Power when adding section footers. Signed-off-by: Dr. David Alan Gilbert Fixes: 37fb569c0198cba58e3e Signed-off-by: Juan Quintela --- hw/ppc/spapr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index f174e5a0f3..01f8da8f3c 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -34,6 +34,7 @@ #include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "kvm_ppc.h" +#include "migration/migration.h" #include "mmu-hash64.h" #include "qom/cpu.h" @@ -1851,6 +1852,7 @@ static const TypeInfo spapr_machine_info = { static void spapr_compat_2_3(Object *obj) { + savevm_skip_section_footers(); } static void spapr_compat_2_2(Object *obj) From 5e0f1940caf49f56e3bee123aa92e42a3f7fad20 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 8 Oct 2014 11:53:22 +0200 Subject: [PATCH 13/28] runstate: Add runstate store This allows us to store the current state to send it through migration. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- include/sysemu/sysemu.h | 1 + vl.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index df809518b4..44570d17e6 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -28,6 +28,7 @@ bool runstate_check(RunState state); void runstate_set(RunState new_state); int runstate_is_running(void); bool runstate_needs_reset(void); +bool runstate_store(char *str, size_t size); typedef struct vm_change_state_entry VMChangeStateEntry; typedef void VMChangeStateHandler(void *opaque, int running, RunState state); diff --git a/vl.c b/vl.c index 69ad90c87f..fec7e930d4 100644 --- a/vl.c +++ b/vl.c @@ -634,6 +634,18 @@ bool runstate_check(RunState state) return current_run_state == state; } +bool runstate_store(char *str, size_t size) +{ + const char *state = RunState_lookup[current_run_state]; + size_t len = strlen(state) + 1; + + if (len > size) { + return false; + } + memcpy(str, state, len); + return true; +} + static void runstate_init(void) { const RunStateTransition *p; From ca3fc39ea9045188e37b84c4f92ee79c7ed4b1c3 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 8 Oct 2014 12:47:08 +0200 Subject: [PATCH 14/28] runstate: migration allows more transitions now Next commit would allow to move from incoming migration to error happening on source. Should we add more states to this transition? Luiz? Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- vl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vl.c b/vl.c index fec7e930d4..19a8737a9a 100644 --- a/vl.c +++ b/vl.c @@ -573,8 +573,14 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_DEBUG, RUN_STATE_RUNNING }, { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE }, - { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_INMIGRATE, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_INMIGRATE, RUN_STATE_IO_ERROR }, { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED }, + { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_INMIGRATE, RUN_STATE_SHUTDOWN }, + { RUN_STATE_INMIGRATE, RUN_STATE_SUSPENDED }, + { RUN_STATE_INMIGRATE, RUN_STATE_WATCHDOG }, + { RUN_STATE_INMIGRATE, RUN_STATE_GUEST_PANICKED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE }, From df4b1024526cae3479da3492d6371fd4a7324a03 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 8 Oct 2014 10:58:10 +0200 Subject: [PATCH 15/28] migration: create new section to store global state This includes a new section that for now just stores the current qemu state. Right now, there are only one way to control what is the state of the target after migration. - If you run the target qemu with -S, it would start stopped. - If you run the target qemu without -S, it would run just after migration finishes. The problem here is what happens if we start the target without -S and there happens one error during migration that puts current state as -EIO. Migration would ends (notice that the error happend doing block IO, network IO, i.e. nothing related with migration), and when migration finish, we would just "continue" running on destination, probably hanging the guest/corruption data, whatever. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- include/migration/migration.h | 1 + migration/migration.c | 105 +++++++++++++++++++++++++++++++--- trace-events | 3 + vl.c | 1 + 4 files changed, 103 insertions(+), 7 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index afba2335e3..86df6cc245 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -197,4 +197,5 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, void ram_mig_init(void); void savevm_skip_section_footers(void); +void register_global_state(void); #endif diff --git a/migration/migration.c b/migration/migration.c index c6ac08a0cb..5e436f7106 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -26,6 +26,7 @@ #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" +#include "qapi/util.h" #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ @@ -97,6 +98,83 @@ void migration_incoming_state_destroy(void) mis_current = NULL; } + +typedef struct { + uint32_t size; + uint8_t runstate[100]; +} GlobalState; + +static GlobalState global_state; + +static int global_state_store(void) +{ + if (!runstate_store((char *)global_state.runstate, + sizeof(global_state.runstate))) { + error_report("runstate name too big: %s", global_state.runstate); + trace_migrate_state_too_big(); + return -EINVAL; + } + return 0; +} + +static char *global_state_get_runstate(void) +{ + return (char *)global_state.runstate; +} + +static int global_state_post_load(void *opaque, int version_id) +{ + GlobalState *s = opaque; + int ret = 0; + char *runstate = (char *)s->runstate; + + trace_migrate_global_state_post_load(runstate); + + if (strcmp(runstate, "running") != 0) { + Error *local_err = NULL; + int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX, + -1, &local_err); + + if (r == -1) { + if (local_err) { + error_report_err(local_err); + } + return -EINVAL; + } + ret = vm_stop_force_state(r); + } + + return ret; +} + +static void global_state_pre_save(void *opaque) +{ + GlobalState *s = opaque; + + trace_migrate_global_state_pre_save((char *)s->runstate); + s->size = strlen((char *)s->runstate) + 1; +} + +static const VMStateDescription vmstate_globalstate = { + .name = "globalstate", + .version_id = 1, + .minimum_version_id = 1, + .post_load = global_state_post_load, + .pre_save = global_state_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(size, GlobalState), + VMSTATE_BUFFER(runstate, GlobalState), + VMSTATE_END_OF_LIST() + }, +}; + +void register_global_state(void) +{ + /* We would use it independently that we receive it */ + strcpy((char *)&global_state.runstate, ""); + vmstate_register(NULL, 0, &vmstate_globalstate, &global_state); +} + /* * Called on -incoming with a defer: uri. * The migration can be started later after any parameters have been @@ -164,10 +242,20 @@ static void process_incoming_migration_co(void *opaque) exit(EXIT_FAILURE); } - if (autostart) { + /* runstate == "" means that we haven't received it through the + * wire, so we obey autostart. runstate == runing means that we + * need to run it, we need to make sure that we do it after + * everything else has finished. Every other state change is done + * at the post_load function */ + + if (strcmp(global_state_get_runstate(), "running") == 0) { vm_start(); - } else { - runstate_set(RUN_STATE_PAUSED); + } else if (strcmp(global_state_get_runstate(), "") == 0) { + if (autostart) { + vm_start(); + } else { + runstate_set(RUN_STATE_PAUSED); + } } migrate_decompress_threads_join(); } @@ -793,10 +881,13 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret >= 0) { - qemu_file_set_rate_limit(s->file, INT64_MAX); - qemu_savevm_state_complete(s->file); + ret = global_state_store(); + if (!ret) { + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (ret >= 0) { + qemu_file_set_rate_limit(s->file, INT64_MAX); + qemu_savevm_state_complete(s->file); + } } qemu_mutex_unlock_iothread(); diff --git a/trace-events b/trace-events index a38dd2e7ae..c0111d0296 100644 --- a/trace-events +++ b/trace-events @@ -1403,6 +1403,9 @@ migrate_fd_error(void) "" migrate_fd_cancel(void) "" migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64 migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 +migrate_state_too_big(void) "" +migrate_global_state_post_load(const char *state) "loaded state: %s" +migrate_global_state_pre_save(const char *state) "saved state: %s" # migration/rdma.c qemu_rdma_accept_incoming_migration(void) "" diff --git a/vl.c b/vl.c index 19a8737a9a..cfa6133308 100644 --- a/vl.c +++ b/vl.c @@ -4624,6 +4624,7 @@ int main(int argc, char **argv, char **envp) return 0; } + register_global_state(); if (incoming) { Error *local_err = NULL; qemu_start_incoming_migration(incoming, &local_err); From 13d16814d2058f10461e6987c8216950389c1310 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 8 Oct 2014 13:58:24 +0200 Subject: [PATCH 16/28] global_state: Make section optional This section would be sent: a- for all new machine types b- for old machine types if section state is different form {running,paused} that were the only giving us troubles. So, in new qemus: it is alwasy there. In old qemus: they are only there if it an error has happened, basically stoping on target. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 1 + hw/ppc/spapr.c | 1 + include/migration/migration.h | 1 + migration/migration.c | 29 +++++++++++++++++++++++++++++ 5 files changed, 33 insertions(+) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 56cdcb9661..3ee3b47cec 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -312,6 +312,7 @@ static void pc_compat_2_3(MachineState *machine) if (kvm_enabled()) { pcms->smm = ON_OFF_AUTO_OFF; } + global_state_set_optional(); } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 8aa3a67fdf..0dc4ab149a 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -295,6 +295,7 @@ static void pc_compat_2_3(MachineState *machine) if (kvm_enabled()) { pcms->smm = ON_OFF_AUTO_OFF; } + global_state_set_optional(); } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 01f8da8f3c..bcf5f0f6ba 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1853,6 +1853,7 @@ static const TypeInfo spapr_machine_info = { static void spapr_compat_2_3(Object *obj) { savevm_skip_section_footers(); + global_state_set_optional(); } static void spapr_compat_2_2(Object *obj) diff --git a/include/migration/migration.h b/include/migration/migration.h index 86df6cc245..f153eba47a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -198,4 +198,5 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, void ram_mig_init(void); void savevm_skip_section_footers(void); void register_global_state(void); +void global_state_set_optional(void); #endif diff --git a/migration/migration.c b/migration/migration.c index 5e436f7106..edb4f3ee92 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -100,6 +100,7 @@ void migration_incoming_state_destroy(void) typedef struct { + bool optional; uint32_t size; uint8_t runstate[100]; } GlobalState; @@ -122,6 +123,33 @@ static char *global_state_get_runstate(void) return (char *)global_state.runstate; } +void global_state_set_optional(void) +{ + global_state.optional = true; +} + +static bool global_state_needed(void *opaque) +{ + GlobalState *s = opaque; + char *runstate = (char *)s->runstate; + + /* If it is not optional, it is mandatory */ + + if (s->optional == false) { + return true; + } + + /* If state is running or paused, it is not needed */ + + if (strcmp(runstate, "running") == 0 || + strcmp(runstate, "paused") == 0) { + return false; + } + + /* for any other state it is needed */ + return true; +} + static int global_state_post_load(void *opaque, int version_id) { GlobalState *s = opaque; @@ -161,6 +189,7 @@ static const VMStateDescription vmstate_globalstate = { .minimum_version_id = 1, .post_load = global_state_post_load, .pre_save = global_state_pre_save, + .needed = global_state_needed, .fields = (VMStateField[]) { VMSTATE_UINT32(size, GlobalState), VMSTATE_BUFFER(runstate, GlobalState), From df8961522a3d6bc7bb60c2830ef59e7c6c67a928 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 15 Oct 2014 09:39:14 +0200 Subject: [PATCH 17/28] vmstate: Create optional sections To make sections optional, we need to do it at the beggining of the code. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- include/migration/vmstate.h | 2 ++ migration/savevm.c | 8 ++++++++ migration/vmstate.c | 11 +++++++++++ trace-events | 1 + 4 files changed, 22 insertions(+) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 0695d7c3de..f51ff693e9 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -820,6 +820,8 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, QJSON *vmdesc); +bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, const VMStateDescription *vmsd, void *base, int alias_id, diff --git a/migration/savevm.c b/migration/savevm.c index 1a9b00b320..e779c96ef9 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -836,6 +836,11 @@ void qemu_savevm_state_complete(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + trace_savevm_section_skip(se->idstr, se->section_id); + continue; + } + trace_savevm_section_start(se->idstr, se->section_id); json_start_object(vmdesc, NULL); @@ -949,6 +954,9 @@ static int qemu_save_device_state(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + continue; + } save_section_header(f, se, QEMU_VM_SECTION_FULL); diff --git a/migration/vmstate.c b/migration/vmstate.c index 6138d1acb7..e8ccf22f67 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -276,6 +276,17 @@ static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc, json_end_object(vmdesc); } + +bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque) +{ + if (vmsd->needed && !vmsd->needed(opaque)) { + /* optional section not needed */ + return false; + } + return true; +} + + void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, QJSON *vmdesc) { diff --git a/trace-events b/trace-events index c0111d0296..2d395c59a9 100644 --- a/trace-events +++ b/trace-events @@ -1191,6 +1191,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u" qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u" savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u" savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d" +savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u" savevm_state_begin(void) "" savevm_state_header(void) "" savevm_state_iterate(void) "" From 61964c23e5ddd5a33f15699e45ce126f879e3e33 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 13 May 2015 18:17:43 +0200 Subject: [PATCH 18/28] migration: Add configuration section It needs to be the first one and it is not optional, that is the reason why it is opencoded. For new machine types, it is required that machine type name is the same in both sides. It is just done right now for pc's. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 1 + include/migration/migration.h | 2 ++ migration/savevm.c | 61 +++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 3ee3b47cec..4c3cb40ce0 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -313,6 +313,7 @@ static void pc_compat_2_3(MachineState *machine) pcms->smm = ON_OFF_AUTO_OFF; } global_state_set_optional(); + savevm_skip_configuration(); } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 0dc4ab149a..43e6c18777 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -296,6 +296,7 @@ static void pc_compat_2_3(MachineState *machine) pcms->smm = ON_OFF_AUTO_OFF; } global_state_set_optional(); + savevm_skip_configuration(); } static void pc_compat_2_2(MachineState *machine) diff --git a/include/migration/migration.h b/include/migration/migration.h index f153eba47a..a308eccf18 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -34,6 +34,7 @@ #define QEMU_VM_SECTION_FULL 0x04 #define QEMU_VM_SUBSECTION 0x05 #define QEMU_VM_VMDESCRIPTION 0x06 +#define QEMU_VM_CONFIGURATION 0x07 #define QEMU_VM_SECTION_FOOTER 0x7e struct MigrationParams { @@ -199,4 +200,5 @@ void ram_mig_init(void); void savevm_skip_section_footers(void); void register_global_state(void); void global_state_set_optional(void); +void savevm_skip_configuration(void); #endif diff --git a/migration/savevm.c b/migration/savevm.c index e779c96ef9..b28a2694ad 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -246,11 +246,55 @@ typedef struct SaveStateEntry { typedef struct SaveState { QTAILQ_HEAD(, SaveStateEntry) handlers; int global_section_id; + bool skip_configuration; + uint32_t len; + const char *name; } SaveState; static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), .global_section_id = 0, + .skip_configuration = false, +}; + +void savevm_skip_configuration(void) +{ + savevm_state.skip_configuration = true; +} + + +static void configuration_pre_save(void *opaque) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + state->len = strlen(current_name); + state->name = current_name; +} + +static int configuration_post_load(void *opaque, int version_id) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + if (strncmp(state->name, current_name, state->len) != 0) { + error_report("Machine type received is '%s' and local is '%s'", + state->name, current_name); + return -EINVAL; + } + return 0; +} + +static const VMStateDescription vmstate_configuration = { + .name = "configuration", + .version_id = 1, + .post_load = configuration_post_load, + .pre_save = configuration_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(len, SaveState), + VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), + VMSTATE_END_OF_LIST() + }, }; static void dump_vmstate_vmsd(FILE *out_file, @@ -723,6 +767,11 @@ void qemu_savevm_state_begin(QEMUFile *f, se->ops->set_params(params, se->opaque); } + if (!savevm_state.skip_configuration) { + qemu_put_byte(f, QEMU_VM_CONFIGURATION); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); + } + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || !se->ops->save_live_setup) { continue; @@ -1037,6 +1086,18 @@ int qemu_loadvm_state(QEMUFile *f) return -ENOTSUP; } + if (!savevm_state.skip_configuration) { + if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { + error_report("Configuration section missing"); + return -EINVAL; + } + ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); + + if (ret) { + return ret; + } + } + while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; SaveStateEntry *se; From a5c17b5f68ff7e37ce6e6e1f5457307fe07629e7 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 17 Jun 2015 02:06:20 +0200 Subject: [PATCH 19/28] migration: Use cmpxchg correctly cmpxchg returns the old value Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- migration/migration.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index edb4f3ee92..1e34aa5f20 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -509,7 +509,7 @@ void qmp_migrate_set_parameters(bool has_compress_level, static void migrate_set_state(MigrationState *s, int old_state, int new_state) { - if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) { + if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { trace_migrate_set_state(new_state); } } From 656a233440e552230f9d1da016b94a81b86658dc Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 1 Jul 2015 09:32:29 +0200 Subject: [PATCH 20/28] migration: ensure we start in NONE state Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- migration/migration.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 1e34aa5f20..5c1233fb01 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -694,7 +694,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, error_setg(errp, QERR_MIGRATION_ACTIVE); return; } - if (runstate_check(RUN_STATE_INMIGRATE)) { error_setg(errp, "Guest is waiting for an incoming migration"); return; @@ -709,6 +708,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, return; } + /* We are starting a new migration, so we want to start in a clean + state. This change is only needed if previous migration + failed/was cancelled. We don't use migrate_set_state() because + we are setting the initial state, not changing it. */ + s->state = MIGRATION_STATUS_NONE; + s = migrate_init(¶ms); if (strstart(uri, "tcp:", &p)) { From 7844337d1efb2c47dc3f306d7621e1cafca8ba67 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 17 Jun 2015 01:36:40 +0200 Subject: [PATCH 21/28] migration: Use always helper to set state There were three places that were not using the migrate_set_state() helper, just fix that. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- migration/migration.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 5c1233fb01..78eb71535e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -549,7 +549,7 @@ void migrate_fd_error(MigrationState *s) { trace_migrate_fd_error(); assert(s->file == NULL); - s->state = MIGRATION_STATUS_FAILED; + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); trace_migrate_set_state(MIGRATION_STATUS_FAILED); notifier_list_notify(&migration_state_notifiers, s); } @@ -634,7 +634,7 @@ static MigrationState *migrate_init(const MigrationParams *params) s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = decompress_thread_count; s->bandwidth_limit = bandwidth_limit; - s->state = MIGRATION_STATUS_SETUP; + migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); trace_migrate_set_state(MIGRATION_STATUS_SETUP); s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -733,7 +733,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } else { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); - s->state = MIGRATION_STATUS_FAILED; + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); return; } From f2bb932491185a39922dff0514c4b08c092f3c35 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 17 Jun 2015 01:38:25 +0200 Subject: [PATCH 22/28] migration: No need to call trace_migrate_set_state() We now use the helper everywhere, so no need to call this on this two places. See on previous commit that there were a place where we missed to mark the trace. Now all tracing is done in migrate_set_state(). Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- migration/migration.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 78eb71535e..ffaa5c8a15 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -550,7 +550,6 @@ void migrate_fd_error(MigrationState *s) trace_migrate_fd_error(); assert(s->file == NULL); migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); - trace_migrate_set_state(MIGRATION_STATUS_FAILED); notifier_list_notify(&migration_state_notifiers, s); } @@ -635,7 +634,6 @@ static MigrationState *migrate_init(const MigrationParams *params) decompress_thread_count; s->bandwidth_limit = bandwidth_limit; migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); - trace_migrate_set_state(MIGRATION_STATUS_SETUP); s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); return s; From 598cd2bda0845096d2f06500e45b4d0d399b384a Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 20 May 2015 12:16:15 +0200 Subject: [PATCH 23/28] migration: create migration event We have one argument that tells us what event has happened. Signed-off-by: Juan Quintela Reviewed-by: Eric Blake --- docs/qmp/qmp-events.txt | 14 ++++++++++++++ migration/migration.c | 2 ++ qapi/event.json | 12 ++++++++++++ 3 files changed, 28 insertions(+) diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index 4c13d48726..d92cc4833b 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -473,6 +473,20 @@ Example: { "timestamp": {"seconds": 1290688046, "microseconds": 417172}, "event": "SPICE_MIGRATE_COMPLETED" } +MIGRATION +--------- + +Emitted when a migration event happens + +Data: None. + + - "status": migration status + See MigrationStatus in ~/qapi-schema.json for possible values + +Example: + +{"timestamp": {"seconds": 1432121972, "microseconds": 744001}, + "event": "MIGRATION", "data": {"status": "completed"}} STOP ---- diff --git a/migration/migration.c b/migration/migration.c index ffaa5c8a15..d8415c4985 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -27,6 +27,7 @@ #include "qmp-commands.h" #include "trace.h" #include "qapi/util.h" +#include "qapi-event.h" #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ @@ -510,6 +511,7 @@ void qmp_migrate_set_parameters(bool has_compress_level, static void migrate_set_state(MigrationState *s, int old_state, int new_state) { if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { + qapi_event_send_migration(new_state, &error_abort); trace_migrate_set_state(new_state); } } diff --git a/qapi/event.json b/qapi/event.json index 378dda572a..f0cef010f0 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -242,6 +242,18 @@ ## { 'event': 'SPICE_MIGRATE_COMPLETED' } +## +# @MIGRATION +# +# Emitted when a migration event happens +# +# @status: @MigrationStatus describing the current migration status. +# +# Since: 2.4 +## +{ 'event': 'MIGRATION', + 'data': {'status': 'MigrationStatus'}} + ## # @ACPI_DEVICE_OST # From b05dc72342b27585909d9e99d95d17fd3dfbb269 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Tue, 7 Jul 2015 14:44:05 +0200 Subject: [PATCH 24/28] migration: Make events a capability Make check fails with events. THis is due to the parser/lexer that it uses. Just in case that they are more broken parsers, just only send events when there are capabilities. Signed-off-by: Juan Quintela Reviewed-by: Dr. David Alan Gilbert --- include/migration/migration.h | 1 + migration/migration.c | 20 ++++++++++++++++++-- qapi-schema.json | 5 ++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index a308eccf18..b2711ef305 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -177,6 +177,7 @@ bool migrate_use_compression(void); int migrate_compress_level(void); int migrate_compress_threads(void); int migrate_decompress_threads(void); +bool migrate_use_events(void); void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); diff --git a/migration/migration.c b/migration/migration.c index d8415c4985..cd32eacc69 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -205,6 +205,14 @@ void register_global_state(void) vmstate_register(NULL, 0, &vmstate_globalstate, &global_state); } +static void migrate_generate_event(int new_state) +{ + if (migrate_use_events()) { + qapi_event_send_migration(new_state, &error_abort); + trace_migrate_set_state(new_state); + } +} + /* * Called on -incoming with a defer: uri. * The migration can be started later after any parameters have been @@ -511,8 +519,7 @@ void qmp_migrate_set_parameters(bool has_compress_level, static void migrate_set_state(MigrationState *s, int old_state, int new_state) { if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { - qapi_event_send_migration(new_state, &error_abort); - trace_migrate_set_state(new_state); + migrate_generate_event(new_state); } } @@ -862,6 +869,15 @@ int migrate_decompress_threads(void) return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; } +bool migrate_use_events(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; +} + int migrate_use_xbzrle(void) { MigrationState *s; diff --git a/qapi-schema.json b/qapi-schema.json index 106008cdeb..1285b8c74a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -523,6 +523,9 @@ # minimize migration traffic. The feature is disabled by default. # (since 2.4 ) # +# @events: generate events for each migration state change +# (since 2.4 ) +# # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # @@ -530,7 +533,7 @@ ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress'] } + 'compress', 'events'] } ## # @MigrationCapabilityStatus From 7cf1fe6d68eb2ad3b77e2a89f097354db5d627e2 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Wed, 20 May 2015 17:15:42 +0200 Subject: [PATCH 25/28] migration: Add migration events on target side We reuse the migration events from the source side, sending them on the appropiate place. Signed-off-by: Juan Quintela Reviewed-by: Eric Blake Reviewed-by: Dr. David Alan Gilbert --- migration/migration.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index cd32eacc69..45719a0f5f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -230,6 +230,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; + qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort); if (!strcmp(uri, "defer")) { deferred_incoming_migration(errp); } else if (strstart(uri, "tcp:", &p)) { @@ -258,7 +259,7 @@ static void process_incoming_migration_co(void *opaque) int ret; migration_incoming_state_new(f); - + migrate_generate_event(MIGRATION_STATUS_ACTIVE); ret = qemu_loadvm_state(f); qemu_fclose(f); @@ -266,10 +267,12 @@ static void process_incoming_migration_co(void *opaque) migration_incoming_state_destroy(); if (ret < 0) { + migrate_generate_event(MIGRATION_STATUS_FAILED); error_report("load of migration failed: %s", strerror(-ret)); migrate_decompress_threads_join(); exit(EXIT_FAILURE); } + migrate_generate_event(MIGRATION_STATUS_COMPLETED); qemu_announce_self(); /* Make sure all file formats flush their mutable metadata */ From 59f39a47411ab6007a592555dc639aa9753f8d23 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jul 2015 09:22:03 +0100 Subject: [PATCH 26/28] check_section_footers: Check the correct section_id The section footers check was incorrectly checking the section_id in the SaveStateEntry not the LoadStateEntry. These can validly be different if the two QEMU instances have instantiated their devices in a different order. The test only cares that we're finishing the same section we started, and hence it's the LoadStateEntry that we care about. Signed-off-by: Dr. David Alan Gilbert Reported-by: Christian Borntraeger Signed-off-by: Juan Quintela --- migration/savevm.c | 74 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index b28a2694ad..86735fc53a 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -697,41 +697,6 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } -/* - * Read a footer off the wire and check that it matches the expected section - * - * Returns: true if the footer was good - * false if there is a problem (and calls error_report to say why) - */ -static bool check_section_footer(QEMUFile *f, SaveStateEntry *se) -{ - uint8_t read_mark; - uint32_t read_section_id; - - if (skip_section_footers) { - /* No footer to check */ - return true; - } - - read_mark = qemu_get_byte(f); - - if (read_mark != QEMU_VM_SECTION_FOOTER) { - error_report("Missing section footer for %s", se->idstr); - return false; - } - - read_section_id = qemu_get_be32(f); - if (read_section_id != se->section_id) { - error_report("Mismatched section id in footer for %s -" - " read 0x%x expected 0x%x", - se->idstr, read_section_id, se->section_id); - return false; - } - - /* All good */ - return true; -} - bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1046,6 +1011,41 @@ struct LoadStateEntry { int version_id; }; +/* + * Read a footer off the wire and check that it matches the expected section + * + * Returns: true if the footer was good + * false if there is a problem (and calls error_report to say why) + */ +static bool check_section_footer(QEMUFile *f, LoadStateEntry *le) +{ + uint8_t read_mark; + uint32_t read_section_id; + + if (skip_section_footers) { + /* No footer to check */ + return true; + } + + read_mark = qemu_get_byte(f); + + if (read_mark != QEMU_VM_SECTION_FOOTER) { + error_report("Missing section footer for %s", le->se->idstr); + return false; + } + + read_section_id = qemu_get_be32(f); + if (read_section_id != le->section_id) { + error_report("Mismatched section id in footer for %s -" + " read 0x%x expected 0x%x", + le->se->idstr, read_section_id, le->section_id); + return false; + } + + /* All good */ + return true; +} + void loadvm_free_handlers(MigrationIncomingState *mis) { LoadStateEntry *le, *new_le; @@ -1151,7 +1151,7 @@ int qemu_loadvm_state(QEMUFile *f) " device '%s'", instance_id, idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } @@ -1178,7 +1178,7 @@ int qemu_loadvm_state(QEMUFile *f) section_id, le->se->idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } From 2ff64038a59e8de2baa485806be0838f49f70b79 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 2 Jul 2015 20:18:05 +0800 Subject: [PATCH 27/28] migration: protect migration_bitmap Signed-off-by: Li Zhijian Signed-off-by: Wen Congyang Signed-off-by: Juan Quintela --- migration/ram.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 644f52ad85..9c0bcfefc8 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -494,6 +494,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return 1; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) @@ -502,26 +503,31 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, unsigned long nr = base + (start >> TARGET_PAGE_BITS); uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr)); unsigned long size = base + (mr_size >> TARGET_PAGE_BITS); + unsigned long *bitmap; unsigned long next; + bitmap = atomic_rcu_read(&migration_bitmap); if (ram_bulk_stage && nr > base) { next = nr + 1; } else { - next = find_next_bit(migration_bitmap, size, nr); + next = find_next_bit(bitmap, size, nr); } if (next < size) { - clear_bit(next, migration_bitmap); + clear_bit(next, bitmap); migration_dirty_pages--; } return (next - base) << TARGET_PAGE_BITS; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) { + unsigned long *bitmap; + bitmap = atomic_rcu_read(&migration_bitmap); migration_dirty_pages += - cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length); + cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); } @@ -1017,10 +1023,15 @@ void free_xbzrle_decoded_buf(void) static void migration_end(void) { - if (migration_bitmap) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against this migration_bitmap + */ + unsigned long *bitmap = migration_bitmap; + atomic_rcu_set(&migration_bitmap, NULL); + if (bitmap) { memory_global_dirty_log_stop(); - g_free(migration_bitmap); - migration_bitmap = NULL; + synchronize_rcu(); + g_free(bitmap); } XBZRLE_cache_lock(); From dd63169766abd2b8dc33f4451dac5e778458a47c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 2 Jul 2015 20:18:06 +0800 Subject: [PATCH 28/28] migration: extend migration_bitmap Prevously, if we hotplug a device(e.g. device_add e1000) during migration is processing in source side, qemu will add a new ram block but migration_bitmap is not extended. In this case, migration_bitmap will overflow and lead qemu abort unexpectedly. Signed-off-by: Li Zhijian Signed-off-by: Wen Congyang Signed-off-by: Juan Quintela --- exec.c | 5 +++++ include/exec/exec-all.h | 3 +++ migration/ram.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/exec.c b/exec.c index 251dc79e10..b7f7f9818f 100644 --- a/exec.c +++ b/exec.c @@ -1414,6 +1414,11 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) } } + new_ram_size = MAX(old_ram_size, + (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); + if (new_ram_size > old_ram_size) { + migration_bitmap_extend(old_ram_size, new_ram_size); + } /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ, * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index d678114cb2..2e74760ade 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -365,4 +365,7 @@ static inline bool cpu_can_do_io(CPUState *cpu) return cpu->can_do_io != 0; } +#if !defined(CONFIG_USER_ONLY) +void migration_bitmap_extend(ram_addr_t old, ram_addr_t new); +#endif #endif diff --git a/migration/ram.c b/migration/ram.c index 9c0bcfefc8..c696814196 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -222,6 +222,7 @@ static RAMBlock *last_seen_block; static RAMBlock *last_sent_block; static ram_addr_t last_offset; static unsigned long *migration_bitmap; +static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; @@ -569,11 +570,13 @@ static void migration_bitmap_sync(void) trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); + qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { migration_bitmap_sync_range(block->mr->ram_addr, block->used_length); } rcu_read_unlock(); + qemu_mutex_unlock(&migration_bitmap_mutex); trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); @@ -1062,6 +1065,30 @@ static void reset_ram_globals(void) #define MAX_WAIT 50 /* ms, half buffered_file limit */ +void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) +{ + /* called in qemu main thread, so there is + * no writing race against this migration_bitmap + */ + if (migration_bitmap) { + unsigned long *old_bitmap = migration_bitmap, *bitmap; + bitmap = bitmap_new(new); + + /* prevent migration_bitmap content from being set bit + * by migration_bitmap_sync_range() at the same time. + * it is safe to migration if migration_bitmap is cleared bit + * at the same time. + */ + qemu_mutex_lock(&migration_bitmap_mutex); + bitmap_copy(bitmap, old_bitmap, old); + bitmap_set(bitmap, old, new - old); + atomic_rcu_set(&migration_bitmap, bitmap); + qemu_mutex_unlock(&migration_bitmap_mutex); + migration_dirty_pages += new - old; + synchronize_rcu(); + g_free(old_bitmap); + } +} /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -1078,6 +1105,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) dirty_rate_high_cnt = 0; bitmap_sync_count = 0; migration_bitmap_sync_init(); + qemu_mutex_init(&migration_bitmap_mutex); if (migrate_use_xbzrle()) { XBZRLE_cache_lock();