From cae98cb87d269c33d23b2bccd79bb8d99a60d811 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 1 Jul 2015 15:45:50 +0100 Subject: [PATCH 1/2] block/mirror: limit qiov to IOV_MAX elements If mirror has more free buffers than IOV_MAX, preadv(2)/pwritev(2) EINVAL failures may be encountered. It is possible to trigger this by setting granularity to a low value like 8192. This patch stops appending chunks once IOV_MAX is reached. The spurious EINVAL failure can be reproduced with a qcow2 image file and the following QMP invocation: qmp.command('drive-mirror', device='virtio0', target='/tmp/r7.s1', granularity=8192, sync='full', mode='absolute-paths', format='raw') While the guest is running dd if=/dev/zero of=/var/tmp/foo oflag=direct bs=4k. Cc: Jeff Cody Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Message-id: 1435761950-26714-1-git-send-email-stefanha@redhat.com Signed-off-by: Jeff Cody --- block/mirror.c | 4 ++++ trace-events | 1 + 2 files changed, 5 insertions(+) diff --git a/block/mirror.c b/block/mirror.c index fc4d8f561e..0841964e97 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -245,6 +245,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); break; } + if (IOV_MAX < nb_chunks + added_chunks) { + trace_mirror_break_iov_max(s, nb_chunks, added_chunks); + break; + } /* We have enough free space to copy these sectors. */ bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); diff --git a/trace-events b/trace-events index 94bf3bb184..8f9614adb5 100644 --- a/trace-events +++ b/trace-events @@ -94,6 +94,7 @@ mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirt mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d" mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" +mirror_break_iov_max(void *s, int nb_chunks, int added_chunks) "s %p requested chunks %d added_chunks %d" # block/backup.c backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d" From e424aff5f307227b1c2512bbb8ece891bb895cef Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 13 Aug 2015 10:41:50 +0200 Subject: [PATCH 2/2] mirror: Fix coroutine reentrance This fixes a regression introduced by commit dcfb3beb ("mirror: Do zero write on target if sectors not allocated"), which was reported to cause aborts with the message "Co-routine re-entered recursively". The cause for this bug is the following code in mirror_iteration_done(): if (s->common.busy) { qemu_coroutine_enter(s->common.co, NULL); } This has always been ugly because - unlike most places that reenter - it doesn't have a specific yield that it pairs with, but is more uncontrolled. What we really mean here is "reenter the coroutine if it's in one of the four explicit yields in mirror.c". This used to be equivalent with s->common.busy because neither mirror_run() nor mirror_iteration() call any function that could yield. However since commit dcfb3beb this doesn't hold true any more: bdrv_get_block_status_above() can yield. So what happens is that bdrv_get_block_status_above() wants to take a lock that is already held, so it adds itself to the queue of waiting coroutines and yields. Instead of being woken up by the unlock function, however, it gets woken up by mirror_iteration_done(), which is obviously wrong. In most cases the code actually happens to cope fairly well with such cases, but in this specific case, the unlock must already have scheduled the coroutine for wakeup when mirror_iteration_done() reentered it. And then the coroutine happened to process the scheduled restarts and tried to reenter itself recursively. This patch fixes the problem by pairing the reenter in mirror_iteration_done() with specific yields instead of abusing s->common.busy. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Reviewed-by: Jeff Cody Message-id: 1439455310-11263-1-git-send-email-kwolf@redhat.com Signed-off-by: Jeff Cody --- block/mirror.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 0841964e97..94744432eb 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -60,6 +60,7 @@ typedef struct MirrorBlockJob { int sectors_in_flight; int ret; bool unmap; + bool waiting_for_io; } MirrorBlockJob; typedef struct MirrorOp { @@ -114,11 +115,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - /* Enter coroutine when it is not sleeping. The coroutine sleeps to - * rate-limit itself. The coroutine will eventually resume since there is - * a sleep timeout so don't wake it early. - */ - if (s->common.busy) { + if (s->waiting_for_io) { qemu_coroutine_enter(s->common.co, NULL); } } @@ -203,7 +200,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Wait for I/O to this cluster (from a previous iteration) to be done. */ while (test_bit(next_chunk, s->in_flight_bitmap)) { trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } do { @@ -239,7 +238,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) */ while (nb_chunks == 0 && s->buf_free_count < added_chunks) { trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } if (s->buf_free_count < nb_chunks + added_chunks) { trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); @@ -337,7 +338,9 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_drain(MirrorBlockJob *s) { while (s->in_flight > 0) { + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } } @@ -510,7 +513,9 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s);