From ead3f1bff99f4a4227975a1f026f4091e50f199f Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Sat, 3 Jul 2021 00:16:34 +0300 Subject: [PATCH 01/11] block/mirror: set .co for active-write MirrorOp objects This field is unused, but it very helpful for debugging. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210702211636.228981-2-vsementsov@virtuozzo.com> Signed-off-by: Kevin Wolf --- block/mirror.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mirror.c b/block/mirror.c index 019f6deaa5..ad6aac2f95 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1343,6 +1343,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, .bytes = bytes, .is_active_write = true, .is_in_flight = true, + .co = qemu_coroutine_self(), }; qemu_co_queue_init(&op->waiting_requests); QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); From e0f69d83d5c5c039b133b60b5a7130dedeeaca42 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Sat, 3 Jul 2021 00:16:35 +0300 Subject: [PATCH 02/11] iotest 151: add test-case that shows active mirror dead-lock There is a dead-lock in active mirror: when we have parallel intersecting requests (note that non intersecting requests may be considered intersecting after aligning to mirror granularity), it may happen that request A waits request B in mirror_wait_on_conflicts() and request B waits for A. Look at the test for details. Test now dead-locks, that's why it's disabled. Next commit will fix mirror and enable the test. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210702211636.228981-3-vsementsov@virtuozzo.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/151 | 62 ++++++++++++++++++++++++++++++++++++-- tests/qemu-iotests/151.out | 4 +-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151 index 182f6b5321..ab46c5e8ba 100755 --- a/tests/qemu-iotests/151 +++ b/tests/qemu-iotests/151 @@ -38,8 +38,9 @@ class TestActiveMirror(iotests.QMPTestCase): 'if': 'none', 'node-name': 'source-node', 'driver': iotests.imgfmt, - 'file': {'driver': 'file', - 'filename': source_img}} + 'file': {'driver': 'blkdebug', + 'image': {'driver': 'file', + 'filename': source_img}}} blk_target = {'node-name': 'target-node', 'driver': iotests.imgfmt, @@ -141,6 +142,63 @@ class TestActiveMirror(iotests.QMPTestCase): self.potential_writes_in_flight = False + def testIntersectingActiveIO(self): + # FIXME: test-case is dead-locking. To reproduce dead-lock just drop + # this return statement + return + + # Fill the source image + result = self.vm.hmp_qemu_io('source', 'write -P 1 0 2M') + + # Start the block job (very slowly) + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + filter_node_name='mirror-node', + device='source-node', + target='target-node', + sync='full', + copy_mode='write-blocking', + speed=1) + + self.vm.hmp_qemu_io('source', 'break write_aio A') + self.vm.hmp_qemu_io('source', 'aio_write 0 1M') # 1 + self.vm.hmp_qemu_io('source', 'wait_break A') + self.vm.hmp_qemu_io('source', 'aio_write 0 2M') # 2 + self.vm.hmp_qemu_io('source', 'aio_write 0 2M') # 3 + + # Now 2 and 3 are in mirror_wait_on_conflicts, waiting for 1 + + self.vm.hmp_qemu_io('source', 'break write_aio B') + self.vm.hmp_qemu_io('source', 'aio_write 1M 2M') # 4 + self.vm.hmp_qemu_io('source', 'wait_break B') + + # 4 doesn't wait for 2 and 3, because they didn't yet set + # in_flight_bitmap. So, nothing prevents 4 to go except for our + # break-point B. + + self.vm.hmp_qemu_io('source', 'resume A') + + # Now we resumed 1, so 2 and 3 goes to the next iteration of while loop + # in mirror_wait_on_conflicts(). They don't exit, as bitmap is dirty + # due to request 4. And they start to wait: 2 wait for 3, 3 wait for 2 + # - DEAD LOCK. + # Note that it's important that we add request 4 at last: requests are + # appended to the list, so we are sure that 4 is last in the list, so 2 + # and 3 now waits for each other, not for 4. + + self.vm.hmp_qemu_io('source', 'resume B') + + # Resuming 4 doesn't help, 2 and 3 already dead-locked + # To check the dead-lock run: + # gdb -p $(pidof qemu-system-x86_64) -ex 'set $job=(MirrorBlockJob *)jobs.lh_first' -ex 'p *$job->ops_in_flight.tqh_first' -ex 'p *$job->ops_in_flight.tqh_first->next.tqe_next' + # You'll see two MirrorOp objects waiting on each other + + result = self.vm.qmp('block-job-set-speed', device='mirror', speed=0) + self.assert_qmp(result, 'return', {}) + self.complete_and_wait(drive='mirror') + + self.potential_writes_in_flight = False + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'raw'], diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out index 8d7e996700..89968f35d7 100644 --- a/tests/qemu-iotests/151.out +++ b/tests/qemu-iotests/151.out @@ -1,5 +1,5 @@ -... +.... ---------------------------------------------------------------------- -Ran 3 tests +Ran 4 tests OK From d44dae1a7cf782ec9235746ebb0e6c1a20dd7288 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Sat, 3 Jul 2021 00:16:36 +0300 Subject: [PATCH 03/11] block/mirror: fix active mirror dead-lock in mirror_wait_on_conflicts It's possible that requests start to wait each other in mirror_wait_on_conflicts(). To avoid it let's use same technique as in block/io.c in bdrv_wait_serialising_requests_locked() / bdrv_find_conflicting_request(): don't wait on intersecting request if it is already waiting for some other request. For details of the dead-lock look at testIntersectingActiveIO() test-case which we actually fixing now. Fixes: d06107ade0ce74dc39739bac80de84b51ec18546 Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210702211636.228981-4-vsementsov@virtuozzo.com> Signed-off-by: Kevin Wolf --- block/mirror.c | 12 ++++++++++++ tests/qemu-iotests/151 | 18 +++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index ad6aac2f95..98fc66eabf 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -107,6 +107,7 @@ struct MirrorOp { bool is_in_flight; CoQueue waiting_requests; Coroutine *co; + MirrorOp *waiting_for_op; QTAILQ_ENTRY(MirrorOp) next; }; @@ -159,7 +160,18 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self, if (ranges_overlap(self_start_chunk, self_nb_chunks, op_start_chunk, op_nb_chunks)) { + /* + * If the operation is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). + */ + if (op->waiting_for_op) { + continue; + } + + self->waiting_for_op = op; qemu_co_queue_wait(&op->waiting_requests, NULL); + self->waiting_for_op = NULL; break; } } diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151 index ab46c5e8ba..93d14193d0 100755 --- a/tests/qemu-iotests/151 +++ b/tests/qemu-iotests/151 @@ -143,10 +143,6 @@ class TestActiveMirror(iotests.QMPTestCase): self.potential_writes_in_flight = False def testIntersectingActiveIO(self): - # FIXME: test-case is dead-locking. To reproduce dead-lock just drop - # this return statement - return - # Fill the source image result = self.vm.hmp_qemu_io('source', 'write -P 1 0 2M') @@ -180,18 +176,14 @@ class TestActiveMirror(iotests.QMPTestCase): # Now we resumed 1, so 2 and 3 goes to the next iteration of while loop # in mirror_wait_on_conflicts(). They don't exit, as bitmap is dirty - # due to request 4. And they start to wait: 2 wait for 3, 3 wait for 2 - # - DEAD LOCK. - # Note that it's important that we add request 4 at last: requests are - # appended to the list, so we are sure that 4 is last in the list, so 2 - # and 3 now waits for each other, not for 4. + # due to request 4. + # In the past at that point 2 and 3 would wait for each other producing + # a dead-lock. Now this is fixed and they will wait for request 4. self.vm.hmp_qemu_io('source', 'resume B') - # Resuming 4 doesn't help, 2 and 3 already dead-locked - # To check the dead-lock run: - # gdb -p $(pidof qemu-system-x86_64) -ex 'set $job=(MirrorBlockJob *)jobs.lh_first' -ex 'p *$job->ops_in_flight.tqh_first' -ex 'p *$job->ops_in_flight.tqh_first->next.tqe_next' - # You'll see two MirrorOp objects waiting on each other + # After resuming 4, one of 2 and 3 goes first and set in_flight_bitmap, + # so the other will wait for it. result = self.vm.qmp('block-job-set-speed', device='mirror', speed=0) self.assert_qmp(result, 'return', {}) From e5f05f8c375157211c7da625a0d3f3ccdb4957d5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 9 Jul 2021 18:41:41 +0200 Subject: [PATCH 04/11] block: Add option to use driver whitelist even in tools Currently, the block driver whitelists are only applied for the system emulator. All other binaries still give unrestricted access to all block drivers. There are use cases where this made sense because the main concern was avoiding customers running VMs on less optimised block drivers and getting bad performance. Allowing the same image format e.g. as a target for 'qemu-img convert' is not a problem then. However, if the concern is the supportability of the driver in general, either in full or when used read-write, not applying the list driver whitelist in tools doesn't help - especially since qemu-nbd and qemu-storage-daemon now give access to more or less the same operations in block drivers as running a system emulator. In order to address this, introduce a new configure option that enforces the driver whitelist in all binaries. Signed-off-by: Kevin Wolf Message-Id: <20210709164141.254097-1-kwolf@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block.c | 3 +++ configure | 14 ++++++++++++-- meson.build | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index be083f389e..e97ce0b1c8 100644 --- a/block.c +++ b/block.c @@ -6162,6 +6162,9 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, void bdrv_init(void) { +#ifdef CONFIG_BDRV_WHITELIST_TOOLS + use_bdrv_whitelist = 1; +#endif module_call_init(MODULE_INIT_BLOCK); } diff --git a/configure b/configure index 63f38fa94c..232c54dcc1 100755 --- a/configure +++ b/configure @@ -243,6 +243,7 @@ cross_prefix="" audio_drv_list="" block_drv_rw_whitelist="" block_drv_ro_whitelist="" +block_drv_whitelist_tools="no" host_cc="cc" audio_win_int="" libs_qga="" @@ -1016,6 +1017,10 @@ for opt do ;; --block-drv-ro-whitelist=*) block_drv_ro_whitelist=$(echo "$optarg" | sed -e 's/,/ /g') ;; + --enable-block-drv-whitelist-in-tools) block_drv_whitelist_tools="yes" + ;; + --disable-block-drv-whitelist-in-tools) block_drv_whitelist_tools="no" + ;; --enable-debug-tcg) debug_tcg="yes" ;; --disable-debug-tcg) debug_tcg="no" @@ -1800,10 +1805,12 @@ Advanced options (experts only): --block-drv-whitelist=L Same as --block-drv-rw-whitelist=L --block-drv-rw-whitelist=L set block driver read-write whitelist - (affects only QEMU, not qemu-img) + (by default affects only QEMU, not tools like qemu-img) --block-drv-ro-whitelist=L set block driver read-only whitelist - (affects only QEMU, not qemu-img) + (by default affects only QEMU, not tools like qemu-img) + --enable-block-drv-whitelist-in-tools + use block whitelist also in tools instead of only QEMU --enable-trace-backends=B Set trace backend Available backends: $trace_backend_list --with-trace-file=NAME Full PATH,NAME of file to store traces @@ -4583,6 +4590,9 @@ if test "$audio_win_int" = "yes" ; then fi echo "CONFIG_BDRV_RW_WHITELIST=$block_drv_rw_whitelist" >> $config_host_mak echo "CONFIG_BDRV_RO_WHITELIST=$block_drv_ro_whitelist" >> $config_host_mak +if test "$block_drv_whitelist_tools" = "yes" ; then + echo "CONFIG_BDRV_WHITELIST_TOOLS=y" >> $config_host_mak +fi if test "$xfs" = "yes" ; then echo "CONFIG_XFS=y" >> $config_host_mak fi diff --git a/meson.build b/meson.build index 6e4d2d8034..2f377098d7 100644 --- a/meson.build +++ b/meson.build @@ -2996,6 +2996,7 @@ summary_info += {'coroutine pool': config_host['CONFIG_COROUTINE_POOL'] == '1 if have_block summary_info += {'Block whitelist (rw)': config_host['CONFIG_BDRV_RW_WHITELIST']} summary_info += {'Block whitelist (ro)': config_host['CONFIG_BDRV_RO_WHITELIST']} + summary_info += {'Use block whitelist in tools': config_host.has_key('CONFIG_BDRV_WHITELIST_TOOLS')} summary_info += {'VirtFS support': have_virtfs} summary_info += {'build virtiofs daemon': have_virtiofsd} summary_info += {'Live block migration': config_host.has_key('CONFIG_LIVE_BLOCK_MIGRATION')} From 1e12ecfd2cd38d06278ee7424fa2ab0bf3c10e93 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Sun, 18 Jul 2021 16:48:24 +0200 Subject: [PATCH 05/11] replication: Remove s->active_disk s->active_disk is bs->file. Remove it and use local variables instead. Signed-off-by: Lukas Straub Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <2534f867ea9be5b666dfce19744b7d4e2b96c976.1626619393.git.lukasstraub2@web.de> Reviewed-by: Zhang Chen Signed-off-by: Kevin Wolf --- block/replication.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/block/replication.c b/block/replication.c index 774e15df16..9ad2dfdc69 100644 --- a/block/replication.c +++ b/block/replication.c @@ -35,7 +35,6 @@ typedef enum { typedef struct BDRVReplicationState { ReplicationMode mode; ReplicationStage stage; - BdrvChild *active_disk; BlockJob *commit_job; BdrvChild *hidden_disk; BdrvChild *secondary_disk; @@ -307,8 +306,10 @@ out: return ret; } -static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) +static void secondary_do_checkpoint(BlockDriverState *bs, Error **errp) { + BDRVReplicationState *s = bs->opaque; + BdrvChild *active_disk = bs->file; Error *local_err = NULL; int ret; @@ -323,13 +324,13 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) return; } - if (!s->active_disk->bs->drv) { + if (!active_disk->bs->drv) { error_setg(errp, "Active disk %s is ejected", - s->active_disk->bs->node_name); + active_disk->bs->node_name); return; } - ret = bdrv_make_empty(s->active_disk, errp); + ret = bdrv_make_empty(active_disk, errp); if (ret < 0) { return; } @@ -458,6 +459,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, BlockDriverState *bs = rs->opaque; BDRVReplicationState *s; BlockDriverState *top_bs; + BdrvChild *active_disk; int64_t active_length, hidden_length, disk_length; AioContext *aio_context; Error *local_err = NULL; @@ -495,15 +497,14 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, case REPLICATION_MODE_PRIMARY: break; case REPLICATION_MODE_SECONDARY: - s->active_disk = bs->file; - if (!s->active_disk || !s->active_disk->bs || - !s->active_disk->bs->backing) { + active_disk = bs->file; + if (!active_disk || !active_disk->bs || !active_disk->bs->backing) { error_setg(errp, "Active disk doesn't have backing file"); aio_context_release(aio_context); return; } - s->hidden_disk = s->active_disk->bs->backing; + s->hidden_disk = active_disk->bs->backing; if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) { error_setg(errp, "Hidden disk doesn't have backing file"); aio_context_release(aio_context); @@ -518,7 +519,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, } /* verify the length */ - active_length = bdrv_getlength(s->active_disk->bs); + active_length = bdrv_getlength(active_disk->bs); hidden_length = bdrv_getlength(s->hidden_disk->bs); disk_length = bdrv_getlength(s->secondary_disk->bs); if (active_length < 0 || hidden_length < 0 || disk_length < 0 || @@ -530,9 +531,9 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, } /* Must be true, or the bdrv_getlength() calls would have failed */ - assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv); + assert(active_disk->bs->drv && s->hidden_disk->bs->drv); - if (!s->active_disk->bs->drv->bdrv_make_empty || + if (!active_disk->bs->drv->bdrv_make_empty || !s->hidden_disk->bs->drv->bdrv_make_empty) { error_setg(errp, "Active disk or hidden disk doesn't support make_empty"); @@ -586,7 +587,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, s->stage = BLOCK_REPLICATION_RUNNING; if (s->mode == REPLICATION_MODE_SECONDARY) { - secondary_do_checkpoint(s, errp); + secondary_do_checkpoint(bs, errp); } s->error = 0; @@ -615,7 +616,7 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp) } if (s->mode == REPLICATION_MODE_SECONDARY) { - secondary_do_checkpoint(s, errp); + secondary_do_checkpoint(bs, errp); } aio_context_release(aio_context); } @@ -652,7 +653,6 @@ static void replication_done(void *opaque, int ret) if (ret == 0) { s->stage = BLOCK_REPLICATION_DONE; - s->active_disk = NULL; s->secondary_disk = NULL; s->hidden_disk = NULL; s->error = 0; @@ -705,7 +705,7 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) } if (!failover) { - secondary_do_checkpoint(s, errp); + secondary_do_checkpoint(bs, errp); s->stage = BLOCK_REPLICATION_DONE; aio_context_release(aio_context); return; @@ -713,7 +713,7 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) s->stage = BLOCK_REPLICATION_FAILOVER; s->commit_job = commit_active_start( - NULL, s->active_disk->bs, s->secondary_disk->bs, + NULL, bs->file->bs, s->secondary_disk->bs, JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT, NULL, replication_done, bs, true, errp); break; From a990a42b39338ffd12fb9640d792276313f75ed5 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Sun, 18 Jul 2021 16:48:29 +0200 Subject: [PATCH 06/11] replication: Reduce usage of s->hidden_disk and s->secondary_disk In preparation for the next patch, initialize s->hidden_disk and s->secondary_disk later and replace access to them with local variables in the places where they aren't initialized yet. Signed-off-by: Lukas Straub Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <1eb9dc179267207d9c7eccaeb30761758e32e9ab.1626619393.git.lukasstraub2@web.de> Signed-off-by: Kevin Wolf --- block/replication.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/block/replication.c b/block/replication.c index 9ad2dfdc69..25bbdf5d4b 100644 --- a/block/replication.c +++ b/block/replication.c @@ -366,27 +366,35 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, Error **errp) { BDRVReplicationState *s = bs->opaque; + BdrvChild *hidden_disk, *secondary_disk; BlockReopenQueue *reopen_queue = NULL; + /* + * s->hidden_disk and s->secondary_disk may not be set yet, as they will + * only be set after the children are writable. + */ + hidden_disk = bs->file->bs->backing; + secondary_disk = hidden_disk->bs->backing; + if (writable) { - s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs); - s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs); + s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs); + s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs); } - bdrv_subtree_drained_begin(s->hidden_disk->bs); - bdrv_subtree_drained_begin(s->secondary_disk->bs); + bdrv_subtree_drained_begin(hidden_disk->bs); + bdrv_subtree_drained_begin(secondary_disk->bs); if (s->orig_hidden_read_only) { QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); - reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, + reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs, opts, true); } if (s->orig_secondary_read_only) { QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); - reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs, + reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs, opts, true); } @@ -401,8 +409,8 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, } } - bdrv_subtree_drained_end(s->hidden_disk->bs); - bdrv_subtree_drained_end(s->secondary_disk->bs); + bdrv_subtree_drained_end(hidden_disk->bs); + bdrv_subtree_drained_end(secondary_disk->bs); } static void backup_job_cleanup(BlockDriverState *bs) @@ -459,7 +467,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, BlockDriverState *bs = rs->opaque; BDRVReplicationState *s; BlockDriverState *top_bs; - BdrvChild *active_disk; + BdrvChild *active_disk, *hidden_disk, *secondary_disk; int64_t active_length, hidden_length, disk_length; AioContext *aio_context; Error *local_err = NULL; @@ -504,15 +512,15 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, return; } - s->hidden_disk = active_disk->bs->backing; - if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) { + hidden_disk = active_disk->bs->backing; + if (!hidden_disk->bs || !hidden_disk->bs->backing) { error_setg(errp, "Hidden disk doesn't have backing file"); aio_context_release(aio_context); return; } - s->secondary_disk = s->hidden_disk->bs->backing; - if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) { + secondary_disk = hidden_disk->bs->backing; + if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) { error_setg(errp, "The secondary disk doesn't have block backend"); aio_context_release(aio_context); return; @@ -520,8 +528,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, /* verify the length */ active_length = bdrv_getlength(active_disk->bs); - hidden_length = bdrv_getlength(s->hidden_disk->bs); - disk_length = bdrv_getlength(s->secondary_disk->bs); + hidden_length = bdrv_getlength(hidden_disk->bs); + disk_length = bdrv_getlength(secondary_disk->bs); if (active_length < 0 || hidden_length < 0 || disk_length < 0 || active_length != hidden_length || hidden_length != disk_length) { error_setg(errp, "Active disk, hidden disk, secondary disk's length" @@ -531,10 +539,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, } /* Must be true, or the bdrv_getlength() calls would have failed */ - assert(active_disk->bs->drv && s->hidden_disk->bs->drv); + assert(active_disk->bs->drv && hidden_disk->bs->drv); if (!active_disk->bs->drv->bdrv_make_empty || - !s->hidden_disk->bs->drv->bdrv_make_empty) { + !hidden_disk->bs->drv->bdrv_make_empty) { error_setg(errp, "Active disk or hidden disk doesn't support make_empty"); aio_context_release(aio_context); @@ -549,6 +557,9 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, return; } + s->hidden_disk = hidden_disk; + s->secondary_disk = secondary_disk; + /* start backup job now */ error_setg(&s->blocker, "Block device is in use by internal backup job"); From 3b78420bb14f7e439a079aee28eeec997a229c5e Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Sun, 18 Jul 2021 16:48:33 +0200 Subject: [PATCH 07/11] replication: Properly attach children The replication driver needs access to the children block-nodes of it's child so it can issue bdrv_make_empty() and bdrv_co_pwritev() to manage the replication. However, it does this by directly copying the BdrvChilds, which is wrong. Fix this by properly attaching the block-nodes with bdrv_attach_child() and requesting the required permissions. This ultimatively fixes a potential crash in replication_co_writev(), because it may write to s->secondary_disk if it is in state BLOCK_REPLICATION_FAILOVER_FAILED, without requesting write permissions first. And now the workaround in secondary_do_checkpoint() can be removed. Signed-off-by: Lukas Straub Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <5d0539d729afb8072d0d7cde977c5066285591b4.1626619393.git.lukasstraub2@web.de> Signed-off-by: Kevin Wolf --- block/replication.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/block/replication.c b/block/replication.c index 25bbdf5d4b..b74192f795 100644 --- a/block/replication.c +++ b/block/replication.c @@ -165,7 +165,12 @@ static void replication_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { - *nperm = BLK_PERM_CONSISTENT_READ; + if (role & BDRV_CHILD_PRIMARY) { + *nperm = BLK_PERM_CONSISTENT_READ; + } else { + *nperm = 0; + } + if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) { *nperm |= BLK_PERM_WRITE; } @@ -557,8 +562,25 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, return; } - s->hidden_disk = hidden_disk; - s->secondary_disk = secondary_disk; + bdrv_ref(hidden_disk->bs); + s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk", + &child_of_bds, BDRV_CHILD_DATA, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + aio_context_release(aio_context); + return; + } + + bdrv_ref(secondary_disk->bs); + s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs, + "secondary disk", &child_of_bds, + BDRV_CHILD_DATA, &local_err); + if (local_err) { + error_propagate(errp, local_err); + aio_context_release(aio_context); + return; + } /* start backup job now */ error_setg(&s->blocker, @@ -664,7 +686,9 @@ static void replication_done(void *opaque, int ret) if (ret == 0) { s->stage = BLOCK_REPLICATION_DONE; + bdrv_unref_child(bs, s->secondary_disk); s->secondary_disk = NULL; + bdrv_unref_child(bs, s->hidden_disk); s->hidden_disk = NULL; s->error = 0; } else { From c2cf0ecab5455f41ab56c131b21e153a3befa8b0 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Sun, 18 Jul 2021 16:48:42 +0200 Subject: [PATCH 08/11] replication: Remove workaround Remove the workaround introduced in commit 6ecbc6c52672db5c13805735ca02784879ce8285 "replication: Avoid blk_make_empty() on read-only child". It is not needed anymore since s->hidden_disk is guaranteed to be writable when secondary_do_checkpoint() runs. Because replication_start(), _do_checkpoint() and _stop() are only called by COLO migration code and COLO-migration activates all disks via bdrv_invalidate_cache_all() before it calls these functions. Signed-off-by: Lukas Straub Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: Signed-off-by: Kevin Wolf --- block/replication.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/block/replication.c b/block/replication.c index b74192f795..32444b9a8f 100644 --- a/block/replication.c +++ b/block/replication.c @@ -346,17 +346,7 @@ static void secondary_do_checkpoint(BlockDriverState *bs, Error **errp) return; } - BlockBackend *blk = blk_new(qemu_get_current_aio_context(), - BLK_PERM_WRITE, BLK_PERM_ALL); - blk_insert_bs(blk, s->hidden_disk->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - blk_unref(blk); - return; - } - - ret = blk_make_empty(blk, errp); - blk_unref(blk); + ret = bdrv_make_empty(s->hidden_disk, errp); if (ret < 0) { return; } From 6af72274efd580fbfc048aad75d3e9af44614590 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 15 Jul 2021 15:48:53 +0300 Subject: [PATCH 09/11] block/vvfat: fix: drop backing Most probably this fake backing child doesn't work anyway (see notes about it in a8a4d15c1c34d). Still, since 25f78d9e2de528473d52 drivers are required to set .supports_backing if they want to call bdrv_set_backing_hd, so now vvfat just doesn't work because of this check. Let's finally drop this fake backing file. Fixes: 25f78d9e2de528473d52acfcf7acdfb64e3453d4 Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210715124853.13335-1-vsementsov@virtuozzo.com> Tested-by: John Arbuckle Signed-off-by: Kevin Wolf --- block/vvfat.c | 43 ++++--------------------------------------- 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index ae9d387da7..34bf1e3a86 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -3098,26 +3098,6 @@ static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA; } -static int coroutine_fn -write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) -{ - int ret; - - BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - qemu_co_mutex_lock(&s->lock); - ret = try_commit(s); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -static BlockDriver vvfat_write_target = { - .format_name = "vvfat_write_target", - .instance_size = sizeof(void*), - .bdrv_co_pwritev = write_target_commit, -}; - static void vvfat_qcow_options(BdrvChildRole role, bool parent_is_format, int *child_flags, QDict *child_options, int parent_flags, QDict *parent_options) @@ -3133,7 +3113,6 @@ static int enable_write_target(BlockDriverState *bs, Error **errp) { BDRVVVFATState *s = bs->opaque; BlockDriver *bdrv_qcow = NULL; - BlockDriverState *backing; QemuOpts *opts = NULL; int ret; int size = sector2cluster(s, s->sector_count); @@ -3184,13 +3163,6 @@ static int enable_write_target(BlockDriverState *bs, Error **errp) unlink(s->qcow_filename); #endif - backing = bdrv_new_open_driver(&vvfat_write_target, NULL, BDRV_O_ALLOW_RDWR, - &error_abort); - *(void**) backing->opaque = s; - - bdrv_set_backing_hd(s->bs, backing, &error_abort); - bdrv_unref(backing); - return 0; err: @@ -3205,17 +3177,10 @@ static void vvfat_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { - if (role & BDRV_CHILD_DATA) { - /* This is a private node, nobody should try to attach to it */ - *nperm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; - *nshared = BLK_PERM_WRITE_UNCHANGED; - } else { - assert(role & BDRV_CHILD_COW); - /* The backing file is there so 'commit' can use it. vvfat doesn't - * access it in any way. */ - *nperm = 0; - *nshared = BLK_PERM_ALL; - } + assert(role & BDRV_CHILD_DATA); + /* This is a private node, nobody should try to attach to it */ + *nperm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; + *nshared = BLK_PERM_WRITE_UNCHANGED; } static void vvfat_close(BlockDriverState *bs) From 8573823f3ba2b63926f82d5732473e0cd73c1213 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 24 Jun 2021 10:38:24 +0200 Subject: [PATCH 10/11] block/export: Conditionally ignore set-context error When invoking block-export-add with some iothread and fixed-iothread=false, and changing the node's iothread fails, the error is supposed to be ignored. However, it is still stored in *errp, which is wrong. If a second error occurs, the "*errp must be NULL" assertion in error_setv() fails: qemu-system-x86_64: ../util/error.c:59: error_setv: Assertion `*errp == NULL' failed. So if fixed-iothread=false, we should ignore the error by passing NULL to bdrv_try_set_aio_context(). Fixes: f51d23c80af73c95e0ce703ad06a300f1b3d63ef ("block/export: add iothread and fixed-iothread options") Signed-off-by: Max Reitz Message-Id: <20210624083825.29224-2-mreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/export/export.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/export/export.c b/block/export/export.c index fec7d9f738..6d3b9964c8 100644 --- a/block/export/export.c +++ b/block/export/export.c @@ -111,6 +111,7 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp) if (export->has_iothread) { IOThread *iothread; AioContext *new_ctx; + Error **set_context_errp; iothread = iothread_by_id(export->iothread); if (!iothread) { @@ -120,7 +121,9 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp) new_ctx = iothread_get_aio_context(iothread); - ret = bdrv_try_set_aio_context(bs, new_ctx, errp); + /* Ignore errors with fixed-iothread=false */ + set_context_errp = fixed_iothread ? errp : NULL; + ret = bdrv_try_set_aio_context(bs, new_ctx, set_context_errp); if (ret == 0) { aio_context_release(ctx); aio_context_acquire(new_ctx); From d21471696b07f30cb00453709d055a25c1afde85 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 24 Jun 2021 10:38:25 +0200 Subject: [PATCH 11/11] iotests/307: Test iothread conflict for exports Passing fixed-iothread=true should make iothread conflicts fatal, whereas fixed-iothread=false should not. Combine the second case with an error condition that is checked after the iothread is handled, to verify that qemu does not crash if there is such an error after changing the iothread failed. Signed-off-by: Max Reitz Message-Id: <20210624083825.29224-3-mreitz@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Tested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- tests/qemu-iotests/307 | 15 +++++++++++++++ tests/qemu-iotests/307.out | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/tests/qemu-iotests/307 b/tests/qemu-iotests/307 index c7685347bc..b429b5aa50 100755 --- a/tests/qemu-iotests/307 +++ b/tests/qemu-iotests/307 @@ -41,9 +41,11 @@ with iotests.FilePath('image') as img, \ iotests.log('=== Launch VM ===') vm.add_object('iothread,id=iothread0') + vm.add_object('iothread,id=iothread1') vm.add_blockdev(f'file,filename={img},node-name=file') vm.add_blockdev(f'{iotests.imgfmt},file=file,node-name=fmt') vm.add_blockdev('raw,file=file,node-name=ro,read-only=on') + vm.add_blockdev('null-co,node-name=null') vm.add_device(f'id=scsi0,driver=virtio-scsi,iothread=iothread0') vm.launch() @@ -74,6 +76,19 @@ with iotests.FilePath('image') as img, \ vm.qmp_log('query-block-exports') iotests.qemu_nbd_list_log('-k', socket) + iotests.log('\n=== Add export with conflicting iothread ===') + + vm.qmp_log('device_add', id='sdb', driver='scsi-hd', drive='null') + + # Should fail because of fixed-iothread + vm.qmp_log('block-export-add', id='export1', type='nbd', node_name='null', + iothread='iothread1', fixed_iothread=True, writable=True) + + # Should ignore the iothread conflict, but then fail because of the + # permission conflict (and not crash) + vm.qmp_log('block-export-add', id='export1', type='nbd', node_name='null', + iothread='iothread1', fixed_iothread=False, writable=True) + iotests.log('\n=== Add a writable export ===') # This fails because share-rw=off diff --git a/tests/qemu-iotests/307.out b/tests/qemu-iotests/307.out index 4b0c7e155a..ec8d2be0e0 100644 --- a/tests/qemu-iotests/307.out +++ b/tests/qemu-iotests/307.out @@ -51,6 +51,14 @@ exports available: 1 base:allocation +=== Add export with conflicting iothread === +{"execute": "device_add", "arguments": {"drive": "null", "driver": "scsi-hd", "id": "sdb"}} +{"return": {}} +{"execute": "block-export-add", "arguments": {"fixed-iothread": true, "id": "export1", "iothread": "iothread1", "node-name": "null", "type": "nbd", "writable": true}} +{"error": {"class": "GenericError", "desc": "Cannot change iothread of active block backend"}} +{"execute": "block-export-add", "arguments": {"fixed-iothread": false, "id": "export1", "iothread": "iothread1", "node-name": "null", "type": "nbd", "writable": true}} +{"error": {"class": "GenericError", "desc": "Permission conflict on node 'null': permissions 'write' are both required by an unnamed block device (uses node 'null' as 'root' child) and unshared by block device 'sdb' (uses node 'null' as 'root' child)."}} + === Add a writable export === {"execute": "block-export-add", "arguments": {"description": "This is the writable second export", "id": "export1", "name": "export1", "node-name": "fmt", "type": "nbd", "writable": true, "writethrough": true}} {"error": {"class": "GenericError", "desc": "Permission conflict on node 'fmt': permissions 'write' are both required by an unnamed block device (uses node 'fmt' as 'root' child) and unshared by block device 'sda' (uses node 'fmt' as 'root' child)."}}