From ddf2d98a94c8a98a661a217fb629cfd15f4dcec7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 8 May 2018 14:54:35 +0100 Subject: [PATCH 1/4] qemu-iotests: reduce chance of races in 185 Commit 8565c3ab537e78f3e69977ec2c609dc9417a806e ("qemu-iotests: fix 185") identified a race condition in a sub-test. Similar issues also affect the other sub-tests. If disk I/O completes quickly, it races with the QMP 'quit' command. This causes spurious test failures because QMP events are emitted in an unpredictable order. This test relies on QEMU internals and there is no QMP API for getting deterministic behavior needed to make this test 100% reliable. At the same time, the test is useful and it would be a shame to remove it. Add sleep 0.5 to reduce the chance of races. This is not a real fix but appears to reduce spurious failures in practice. Cc: Vladimir Sementsov-Ogievskiy Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-id: 20180508135436.30140-2-stefanha@redhat.com Reviewed-by: Jeff Cody Signed-off-by: Jeff Cody --- tests/qemu-iotests/185 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185 index 298d88d04e..975404c99d 100755 --- a/tests/qemu-iotests/185 +++ b/tests/qemu-iotests/185 @@ -118,6 +118,9 @@ _send_qemu_cmd $h \ 'speed': 65536 } }" \ "return" +# If we don't sleep here 'quit' command races with disk I/O +sleep 0.5 + _send_qemu_cmd $h "{ 'execute': 'quit' }" "return" wait=1 _cleanup_qemu @@ -137,6 +140,9 @@ _send_qemu_cmd $h \ 'speed': 65536 } }" \ "return" +# If we don't sleep here 'quit' command races with disk I/O +sleep 0.5 + _send_qemu_cmd $h "{ 'execute': 'quit' }" "return" wait=1 _cleanup_qemu @@ -183,6 +189,9 @@ _send_qemu_cmd $h \ 'speed': 65536 } }" \ "return" +# If we don't sleep here 'quit' command races with disk I/O +sleep 0.5 + _send_qemu_cmd $h "{ 'execute': 'quit' }" "return" wait=1 _cleanup_qemu @@ -201,6 +210,9 @@ _send_qemu_cmd $h \ 'speed': 65536 } }" \ "return" +# If we don't sleep here 'quit' command races with disk I/O +sleep 0.5 + _send_qemu_cmd $h "{ 'execute': 'quit' }" "return" wait=1 _cleanup_qemu From 4c7e813ce964e230bb55cf4afc862ccb091ca3a3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 8 May 2018 14:54:36 +0100 Subject: [PATCH 2/4] blockjob: do not cancel timer in resume Currently the timer is cancelled and the block job is entered by block_job_resume(). This behavior causes drain to run extra blockjob iterations when the job was sleeping due to the ratelimit. This patch leaves the job asleep when block_job_resume() is called. Jobs can still be forcibly woken up using block_job_enter(), which is used to cancel jobs. After this patch drain no longer runs extra blockjob iterations. This is the expected behavior that qemu-iotests 185 used to rely on. We temporarily changed the 185 test output to make it pass for the QEMU 2.12 release but now it's time to address this issue. Cc: QingFeng Hao Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: QingFeng Hao Message-id: 20180508135436.30140-3-stefanha@redhat.com Reviewed-by: Jeff Cody Signed-off-by: Jeff Cody --- blockjob.c | 22 +++++++++++++++------- tests/qemu-iotests/185 | 5 +---- tests/qemu-iotests/185.out | 12 +++++------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/blockjob.c b/blockjob.c index 36c5fdeb2f..112672a68b 100644 --- a/blockjob.c +++ b/blockjob.c @@ -209,6 +209,18 @@ static void block_job_txn_del_job(BlockJob *job) } } +/* Assumes the block_job_mutex is held */ +static bool block_job_timer_pending(BlockJob *job) +{ + return timer_pending(&job->sleep_timer); +} + +/* Assumes the block_job_mutex is held */ +static bool block_job_timer_not_pending(BlockJob *job) +{ + return !block_job_timer_pending(job); +} + static void block_job_pause(BlockJob *job) { job->pause_count++; @@ -221,7 +233,9 @@ static void block_job_resume(BlockJob *job) if (job->pause_count) { return; } - block_job_enter(job); + + /* kick only if no timer is pending */ + block_job_enter_cond(job, block_job_timer_not_pending); } void block_job_ref(BlockJob *job) @@ -656,12 +670,6 @@ static void block_job_completed_txn_success(BlockJob *job) } } -/* Assumes the block_job_mutex is held */ -static bool block_job_timer_pending(BlockJob *job) -{ - return timer_pending(&job->sleep_timer); -} - void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { int64_t old_speed = job->speed; diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185 index 975404c99d..9a2d317414 100755 --- a/tests/qemu-iotests/185 +++ b/tests/qemu-iotests/185 @@ -101,14 +101,11 @@ echo # command to be received (after receiving the command, the rest runs # synchronously, so jobs can arbitrarily continue or complete). # -# Jobs present while QEMU is terminating iterate once more due to -# bdrv_drain_all(). -# # The buffer size for commit and streaming is 512k (waiting for 8 seconds after # the first request), for active commit and mirror it's large enough to cover # the full 4M, and for backup it's the qcow2 cluster size, which we know is # 64k. As all of these are at least as large as the speed, we are sure that the -# offset advances exactly twice before qemu exits. +# offset advances exactly once before qemu exits. _send_qemu_cmd $h \ "{ 'execute': 'block-commit', diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out index 992162f418..57eaf8d699 100644 --- a/tests/qemu-iotests/185.out +++ b/tests/qemu-iotests/185.out @@ -20,7 +20,7 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.q {"return": {}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 1048576, "speed": 65536, "type": "commit"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "commit"}} === Start active commit job and exit qemu === @@ -28,8 +28,7 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.q {"return": {}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}} === Start mirror job and exit qemu === @@ -38,8 +37,7 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 l {"return": {}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}} === Start backup job and exit qemu === @@ -48,7 +46,7 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 l {"return": {}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 131072, "speed": 65536, "type": "backup"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 65536, "speed": 65536, "type": "backup"}} === Start streaming job and exit qemu === @@ -56,6 +54,6 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 l {"return": {}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 1048576, "speed": 65536, "type": "stream"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "stream"}} No errors were found on the image. *** done From 54b7af4369a37afbd82573d0dcfb27febdb6dd24 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 May 2018 18:10:34 +0200 Subject: [PATCH 3/4] nfs: Fix error path in nfs_options_qdict_to_qapi() Don't throw away local_err, but propagate it to errp. Signed-off-by: Kevin Wolf Message-id: 20180516161034.27440-1-kwolf@redhat.com Reviewed-by: Eric Blake Reviewed-by: Jeff Cody Signed-off-by: Jeff Cody --- block/nfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/nfs.c b/block/nfs.c index 66fddf12d4..4ee2ad59d9 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -570,6 +570,7 @@ static BlockdevOptionsNfs *nfs_options_qdict_to_qapi(QDict *options, qobject_unref(crumpled); if (local_err) { + error_propagate(errp, local_err); return NULL; } From c82be42cc803b36fd7aed5dceec68312c7056fd5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 May 2018 18:08:16 +0200 Subject: [PATCH 4/4] nfs: Remove processed options from QDict Commit c22a03454 QAPIfied option parsing in the NFS block driver, but forgot to remove all the options we processed. Therefore, we get an error in bdrv_open_inherit(), which thinks the remaining options are invalid. Trying to open an NFS image will result in an error like this: Block protocol 'nfs' doesn't support the option 'server.host' Remove all options from the QDict to make the NFS driver work again. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Message-id: 20180516160816.26259-1-kwolf@redhat.com Reviewed-by: Eric Blake Reviewed-by: Jeff Cody Signed-off-by: Jeff Cody --- block/nfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/nfs.c b/block/nfs.c index 4ee2ad59d9..3349b67a76 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -557,6 +557,7 @@ static BlockdevOptionsNfs *nfs_options_qdict_to_qapi(QDict *options, BlockdevOptionsNfs *opts = NULL; QObject *crumpled = NULL; Visitor *v; + const QDictEntry *e; Error *local_err = NULL; crumpled = qdict_crumple(options, errp); @@ -574,6 +575,12 @@ static BlockdevOptionsNfs *nfs_options_qdict_to_qapi(QDict *options, return NULL; } + /* Remove the processed options from the QDict (the visitor processes + * _all_ options in the QDict) */ + while ((e = qdict_first(options))) { + qdict_del(options, e->key); + } + return opts; }