From b660a84bbb0eb1a76b505648d31d5e82594fb75e Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Tue, 7 Apr 2020 13:56:49 +0200 Subject: [PATCH 1/7] job: take each job's lock individually in job_txn_apply All callers of job_txn_apply hold a single job's lock, but different jobs within a transaction can have different contexts, thus we need to lock each one individually before applying the callback function. Similar to job_completed_txn_abort this also requires releasing the caller's context before and reacquiring it after to avoid recursive locks which might break AIO_WAIT_WHILE in the callback. This is safe, since existing code would already have to take this into account, lest job_completed_txn_abort might have broken. This also brings to light a different issue: When a callback function in job_txn_apply moves it's job to a different AIO context, callers will try to release the wrong lock (now that we re-acquire the lock correctly, previously it would just continue with the old lock, leaving the job unlocked for the rest of the return path). Fix this by not caching the job's context. This is only necessary for qmp_block_job_finalize, qmp_job_finalize and job_exit, since everyone else calls through job_exit. One test needed adapting, since it calls job_finalize directly, so it manually needs to acquire the correct context. Signed-off-by: Stefan Reiter Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com> Signed-off-by: Kevin Wolf --- blockdev.c | 9 ++++++++ job-qmp.c | 9 ++++++++ job.c | 50 ++++++++++++++++++++++++++++++++++--------- tests/test-blockjob.c | 2 ++ 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/blockdev.c b/blockdev.c index fa8630cb41..5faddaa705 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3612,7 +3612,16 @@ void qmp_block_job_finalize(const char *id, Error **errp) } trace_qmp_block_job_finalize(job); + job_ref(&job->job); job_finalize(&job->job, errp); + + /* + * Job's context might have changed via job_finalize (and job_txn_apply + * automatically acquires the new one), so make sure we release the correct + * one. + */ + aio_context = blk_get_aio_context(job->blk); + job_unref(&job->job); aio_context_release(aio_context); } diff --git a/job-qmp.c b/job-qmp.c index fecc939ebd..f9a58832e1 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -114,7 +114,16 @@ void qmp_job_finalize(const char *id, Error **errp) } trace_qmp_job_finalize(job); + job_ref(job); job_finalize(job, errp); + + /* + * Job's context might have changed via job_finalize (and job_txn_apply + * automatically acquires the new one), so make sure we release the correct + * one. + */ + aio_context = job->aio_context; + job_unref(job); aio_context_release(aio_context); } diff --git a/job.c b/job.c index 134a07b92e..53be57a3a0 100644 --- a/job.c +++ b/job.c @@ -136,17 +136,38 @@ static void job_txn_del_job(Job *job) } } -static int job_txn_apply(JobTxn *txn, int fn(Job *)) +static int job_txn_apply(Job *job, int fn(Job *)) { - Job *job, *next; + AioContext *inner_ctx; + Job *other_job, *next; + JobTxn *txn = job->txn; int rc = 0; - QLIST_FOREACH_SAFE(job, &txn->jobs, txn_list, next) { - rc = fn(job); + /* + * Similar to job_completed_txn_abort, we take each job's lock before + * applying fn, but since we assume that outer_ctx is held by the caller, + * we need to release it here to avoid holding the lock twice - which would + * break AIO_WAIT_WHILE from within fn. 
+ */ + job_ref(job); + aio_context_release(job->aio_context); + + QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { + inner_ctx = other_job->aio_context; + aio_context_acquire(inner_ctx); + rc = fn(other_job); + aio_context_release(inner_ctx); if (rc) { break; } } + + /* + * Note that job->aio_context might have been changed by calling fn, so we + * can't use a local variable to cache it. + */ + aio_context_acquire(job->aio_context); + job_unref(job); return rc; } @@ -774,11 +795,11 @@ static void job_do_finalize(Job *job) assert(job && job->txn); /* prepare the transaction to complete */ - rc = job_txn_apply(job->txn, job_prepare); + rc = job_txn_apply(job, job_prepare); if (rc) { job_completed_txn_abort(job); } else { - job_txn_apply(job->txn, job_finalize_single); + job_txn_apply(job, job_finalize_single); } } @@ -824,10 +845,10 @@ static void job_completed_txn_success(Job *job) assert(other_job->ret == 0); } - job_txn_apply(txn, job_transition_to_pending); + job_txn_apply(job, job_transition_to_pending); /* If no jobs need manual finalization, automatically do so */ - if (job_txn_apply(txn, job_needs_finalize) == 0) { + if (job_txn_apply(job, job_needs_finalize) == 0) { job_do_finalize(job); } } @@ -849,9 +870,10 @@ static void job_completed(Job *job) static void job_exit(void *opaque) { Job *job = (Job *)opaque; - AioContext *ctx = job->aio_context; + AioContext *ctx; - aio_context_acquire(ctx); + job_ref(job); + aio_context_acquire(job->aio_context); /* This is a lie, we're not quiescent, but still doing the completion * callbacks. However, completion callbacks tend to involve operations that @@ -862,6 +884,14 @@ static void job_exit(void *opaque) job_completed(job); + /* + * Note that calling job_completed can move the job to a different + * aio_context, so we cannot cache from above. job_txn_apply takes care of + * acquiring the new lock, and we ref/unref to avoid job_completed freeing + * the job underneath us. + */ + ctx = job->aio_context; + job_unref(job); aio_context_release(ctx); } diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c index 4eeb184caf..7519847912 100644 --- a/tests/test-blockjob.c +++ b/tests/test-blockjob.c @@ -367,7 +367,9 @@ static void test_cancel_concluded(void) aio_poll(qemu_get_aio_context(), true); assert(job->status == JOB_STATUS_PENDING); + aio_context_acquire(job->aio_context); job_finalize(job, &error_abort); + aio_context_release(job->aio_context); assert(job->status == JOB_STATUS_CONCLUDED); cancel_common(s); From 08558e33257ec796594bd411261028a93414a70c Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Tue, 7 Apr 2020 13:56:50 +0200 Subject: [PATCH 2/7] replication: assert we own context before job_cancel_sync job_cancel_sync requires the job's lock to be held, all other callers already do this (replication_stop, drive_backup_abort, blockdev_backup_abort, job_cancel_sync_all, cancel_common). In this case we're in a BlockDriver handler, so we already have a lock, just assert that it is the same as the one used for the commit_job. 
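For reference, callers that do not already hold the job's context follow roughly this pattern (a simplified sketch of what e.g. cancel_common in tests/test-blockjob.c does, not a literal excerpt):

    /* sketch only: take the job's own context around the synchronous cancel */
    AioContext *ctx = job->aio_context;
    aio_context_acquire(ctx);
    job_cancel_sync(job);
    aio_context_release(ctx);

Since replication_close runs as a BlockDriver handler with that lock already held, the change below only asserts that the current context is indeed the commit job's context instead of acquiring it a second time.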
Signed-off-by: Stefan Reiter Message-Id: <20200407115651.69472-3-s.reiter@proxmox.com> Signed-off-by: Kevin Wolf --- block/replication.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/replication.c b/block/replication.c index 413d95407d..da013c2041 100644 --- a/block/replication.c +++ b/block/replication.c @@ -144,12 +144,15 @@ fail: static void replication_close(BlockDriverState *bs) { BDRVReplicationState *s = bs->opaque; + Job *commit_job; if (s->stage == BLOCK_REPLICATION_RUNNING) { replication_stop(s->rs, false, NULL); } if (s->stage == BLOCK_REPLICATION_FAILOVER) { - job_cancel_sync(&s->commit_job->job); + commit_job = &s->commit_job->job; + assert(commit_job->aio_context == qemu_get_current_aio_context()); + job_cancel_sync(commit_job); } if (s->mode == REPLICATION_MODE_SECONDARY) { From eca0f3524a4eb57d03a56b0cbcef5527a0981ce4 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Tue, 7 Apr 2020 13:56:51 +0200 Subject: [PATCH 3/7] backup: don't acquire aio_context in backup_clean All code-paths leading to backup_clean (via job_clean) have the job's context already acquired. The job's context is guaranteed to be the same as the one used by backup_top via backup_job_create. Since the previous logic effectively acquired the lock twice, this broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock once, thus deadlocking with the IO thread. This is a partial revert of 0abf2581717a19. Signed-off-by: Stefan Reiter Reviewed-by: Max Reitz Message-Id: <20200407115651.69472-4-s.reiter@proxmox.com> Signed-off-by: Kevin Wolf --- block/backup.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/backup.c b/block/backup.c index 7430ca5883..a7a7dcaf4c 100644 --- a/block/backup.c +++ b/block/backup.c @@ -126,11 +126,7 @@ static void backup_abort(Job *job) static void backup_clean(Job *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); - AioContext *aio_context = bdrv_get_aio_context(s->backup_top); - - aio_context_acquire(aio_context); bdrv_backup_top_drop(s->backup_top); - aio_context_release(aio_context); } void backup_do_checkpoint(BlockJob *job, Error **errp) From 564806c529d4e0acad209b1e5b864a8886092f1f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 7 Apr 2020 14:12:57 +0200 Subject: [PATCH 4/7] block-backend: Reorder flush/pdiscard function definitions Move all variants of the flush/pdiscard functions to a single place and put the blk_co_*() version first because it is called by all other variants (and will become static in the next patch). 
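One practical effect of putting the core version first is that no forward declarations are needed once it becomes static; a generic, self-contained illustration of that ordering (assumed names, not code from this patch):

    /* definition-before-use: the static helper comes first ... */
    static int do_op(int x)
    {
        return x + 1;
    }

    /* ... so the wrappers that follow need no separate prototype */
    int op_wrapper(int x)
    {
        return do_op(x);
    }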
Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-Id: <20200407121259.21350-2-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block/block-backend.c | 92 +++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 8b8f2a80a0..17b2e87afa 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1488,38 +1488,6 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, blk_aio_write_entry, flags, cb, opaque); } -static void blk_aio_flush_entry(void *opaque) -{ - BlkAioEmAIOCB *acb = opaque; - BlkRwCo *rwco = &acb->rwco; - - rwco->ret = blk_co_flush(rwco->blk); - blk_aio_complete(acb); -} - -BlockAIOCB *blk_aio_flush(BlockBackend *blk, - BlockCompletionFunc *cb, void *opaque) -{ - return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); -} - -static void blk_aio_pdiscard_entry(void *opaque) -{ - BlkAioEmAIOCB *acb = opaque; - BlkRwCo *rwco = &acb->rwco; - - rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); - blk_aio_complete(acb); -} - -BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, - int64_t offset, int bytes, - BlockCompletionFunc *cb, void *opaque) -{ - return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, - cb, opaque); -} - void blk_aio_cancel(BlockAIOCB *acb) { bdrv_aio_cancel(acb); @@ -1586,6 +1554,37 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) return bdrv_co_pdiscard(blk->root, offset, bytes); } +static void blk_aio_pdiscard_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, + int64_t offset, int bytes, + BlockCompletionFunc *cb, void *opaque) +{ + return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, + cb, opaque); +} + +static void blk_pdiscard_entry(void *opaque) +{ + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + + rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); + aio_wait_kick(); +} + +int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) +{ + return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); +} + int blk_co_flush(BlockBackend *blk) { blk_wait_while_drained(blk); @@ -1597,6 +1596,21 @@ int blk_co_flush(BlockBackend *blk) return bdrv_co_flush(blk_bs(blk)); } +static void blk_aio_flush_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_flush(rwco->blk); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_flush(BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque) +{ + return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); +} + static void blk_flush_entry(void *opaque) { BlkRwCo *rwco = opaque; @@ -2083,20 +2097,6 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, return bdrv_truncate(blk->root, offset, exact, prealloc, errp); } -static void blk_pdiscard_entry(void *opaque) -{ - BlkRwCo *rwco = opaque; - QEMUIOVector *qiov = rwco->iobuf; - - rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); - aio_wait_kick(); -} - -int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) -{ - return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); -} - int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size) { From fbb92b6798894d3bf62fe3578d99fa62c720b242 Mon 
Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 7 Apr 2020 14:12:58 +0200 Subject: [PATCH 5/7] block: Increase BB.in_flight for coroutine and sync interfaces External callers of blk_co_*() and of the synchronous blk_*() functions don't currently increase the BlockBackend.in_flight counter, but calls from blk_aio_*() do, so there is an inconsistency whether the counter has been increased or not. This patch moves the actual operations to static functions that can later know they will always be called with in_flight increased exactly once, even for external callers using the blk_co_*() coroutine interfaces. If the public blk_co_*() interface is unused, remove it. Signed-off-by: Kevin Wolf Message-Id: <20200407121259.21350-3-kwolf@redhat.com> Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/block-backend.c | 103 +++++++++++++++++++++++++-------- include/sysemu/block-backend.h | 1 - 2 files changed, 80 insertions(+), 24 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 17b2e87afa..610dbfa0b2 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1147,9 +1147,10 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) } } -int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, - unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret; BlockDriverState *bs; @@ -1178,10 +1179,24 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, return ret; } -int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, - unsigned int bytes, - QEMUIOVector *qiov, size_t qiov_offset, - BdrvRequestFlags flags) +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + int ret; + + blk_inc_in_flight(blk); + ret = blk_do_preadv(blk, offset, bytes, qiov, flags); + blk_dec_in_flight(blk); + + return ret; +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags) { int ret; BlockDriverState *bs; @@ -1214,6 +1229,20 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, return ret; } +int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, + unsigned int bytes, + QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags) +{ + int ret; + + blk_inc_in_flight(blk); + ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags); + blk_dec_in_flight(blk); + + return ret; +} + int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) @@ -1234,7 +1263,7 @@ static void blk_read_entry(void *opaque) BlkRwCo *rwco = opaque; QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size, + rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size, qiov, rwco->flags); aio_wait_kick(); } @@ -1244,8 +1273,8 @@ static void blk_write_entry(void *opaque) BlkRwCo *rwco = opaque; QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size, - qiov, rwco->flags); + rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size, + qiov, 0, 
rwco->flags); aio_wait_kick(); } @@ -1262,6 +1291,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, .ret = NOT_DONE, }; + blk_inc_in_flight(blk); if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ co_entry(&rwco); @@ -1270,6 +1300,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, bdrv_coroutine_enter(blk_bs(blk), co); BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); } + blk_dec_in_flight(blk); return rwco.ret; } @@ -1394,7 +1425,7 @@ static void blk_aio_read_entry(void *opaque) } assert(qiov->size == acb->bytes); - rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, + rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, qiov, rwco->flags); blk_aio_complete(acb); } @@ -1412,8 +1443,8 @@ static void blk_aio_write_entry(void *opaque) } assert(!qiov || qiov->size == acb->bytes); - rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, - qiov, rwco->flags); + rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, + qiov, 0, rwco->flags); blk_aio_complete(acb); } @@ -1498,7 +1529,9 @@ void blk_aio_cancel_async(BlockAIOCB *acb) bdrv_aio_cancel_async(acb); } -int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf) +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf) { blk_wait_while_drained(blk); @@ -1514,8 +1547,7 @@ static void blk_ioctl_entry(void *opaque) BlkRwCo *rwco = opaque; QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, - qiov->iov[0].iov_base); + rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base); aio_wait_kick(); } @@ -1529,7 +1561,7 @@ static void blk_aio_ioctl_entry(void *opaque) BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; - rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf); + rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf); blk_aio_complete(acb); } @@ -1540,7 +1572,9 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); } -int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes) { int ret; @@ -1559,7 +1593,7 @@ static void blk_aio_pdiscard_entry(void *opaque) BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; - rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); + rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes); blk_aio_complete(acb); } @@ -1571,12 +1605,23 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, cb, opaque); } +int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) +{ + int ret; + + blk_inc_in_flight(blk); + ret = blk_do_pdiscard(blk, offset, bytes); + blk_dec_in_flight(blk); + + return ret; +} + static void blk_pdiscard_entry(void *opaque) { BlkRwCo *rwco = opaque; QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); + rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size); aio_wait_kick(); } @@ -1585,7 +1630,8 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); } -int blk_co_flush(BlockBackend *blk) +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ 
+static int coroutine_fn blk_do_flush(BlockBackend *blk) { blk_wait_while_drained(blk); @@ -1601,7 +1647,7 @@ static void blk_aio_flush_entry(void *opaque) BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; - rwco->ret = blk_co_flush(rwco->blk); + rwco->ret = blk_do_flush(rwco->blk); blk_aio_complete(acb); } @@ -1611,10 +1657,21 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk, return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); } +int coroutine_fn blk_co_flush(BlockBackend *blk) +{ + int ret; + + blk_inc_in_flight(blk); + ret = blk_do_flush(blk); + blk_dec_in_flight(blk); + + return ret; +} + static void blk_flush_entry(void *opaque) { BlkRwCo *rwco = opaque; - rwco->ret = blk_co_flush(rwco->blk); + rwco->ret = blk_do_flush(rwco->blk); aio_wait_kick(); } diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index b198deca0b..9bbdbd63d7 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -171,7 +171,6 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int bytes, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel(BlockAIOCB *acb); void blk_aio_cancel_async(BlockAIOCB *acb); -int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf); int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); From 7f16476fab14fc32388e0ebae793f64673848efa Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 7 Apr 2020 14:12:59 +0200 Subject: [PATCH 6/7] block: Fix blk->in_flight during blk_wait_while_drained() Waiting in blk_wait_while_drained() while blk->in_flight is increased for the current request is wrong because it will cause the drain operation to deadlock. This patch makes sure that blk_wait_while_drained() is called with blk->in_flight increased exactly once for the current request, and that it temporarily decreases the counter while it waits. 
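The deadlock is straightforward: the drain section polls until blk->in_flight drops to zero, but a request parked in blk_wait_while_drained() with the counter still elevated keeps it nonzero forever. The corrected wait therefore follows this counter discipline (a condensed sketch of the hunk below):

    assert(blk->in_flight > 0);    /* the caller owns exactly one reference */
    blk_dec_in_flight(blk);        /* release it so the drain can complete */
    qemu_co_queue_wait(&blk->queued_requests, NULL);
    blk_inc_in_flight(blk);        /* re-take it before resuming the request */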
Fixes: cf3129323f900ef5ddbccbe86e4fa801e88c566e Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-Id: <20200407121259.21350-4-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block/block-backend.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 610dbfa0b2..38ae413826 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1140,10 +1140,15 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, return 0; } +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) { + assert(blk->in_flight > 0); + if (blk->quiesce_counter && !blk->disable_request_queuing) { + blk_dec_in_flight(blk); qemu_co_queue_wait(&blk->queued_requests, NULL); + blk_inc_in_flight(blk); } } @@ -1418,12 +1423,6 @@ static void blk_aio_read_entry(void *opaque) BlkRwCo *rwco = &acb->rwco; QEMUIOVector *qiov = rwco->iobuf; - if (rwco->blk->quiesce_counter) { - blk_dec_in_flight(rwco->blk); - blk_wait_while_drained(rwco->blk); - blk_inc_in_flight(rwco->blk); - } - assert(qiov->size == acb->bytes); rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, qiov, rwco->flags); @@ -1436,12 +1435,6 @@ static void blk_aio_write_entry(void *opaque) BlkRwCo *rwco = &acb->rwco; QEMUIOVector *qiov = rwco->iobuf; - if (rwco->blk->quiesce_counter) { - blk_dec_in_flight(rwco->blk); - blk_wait_while_drained(rwco->blk); - blk_inc_in_flight(rwco->blk); - } - assert(!qiov || qiov->size == acb->bytes); rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, qiov, 0, rwco->flags); From 3f6de653b946fe849330208becf79d6af7e876cb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 2 Apr 2020 11:36:03 +0200 Subject: [PATCH 7/7] vpc: Don't round up already aligned BAT sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported on Launchpad, Azure apparently doesn't accept images for upload that are not both aligned to 1 MB blocks and have a BAT size that matches the image size exactly. As far as I can tell, there is no real reason why we create a BAT that is one entry longer than necessary for aligned image sizes, so change that. (Even though the condition is only mentioned as "should" in the spec and previous products accepted larger BATs - but we'll try to maintain compatibility with as many of Microsoft's ever-changing interpretations of the VHD spec as possible.) Fixes: https://bugs.launchpad.net/bugs/1870098 Reported-by: Tobias Witek Signed-off-by: Kevin Wolf Message-Id: <20200402093603.2369-1-kwolf@redhat.com> Reviewed-by: Max Reitz Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Kevin Wolf --- block/vpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/vpc.c b/block/vpc.c index 6df75e22dc..d8141b52da 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -835,7 +835,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, /* Write the footer (twice: at the beginning and at the end) */ block_size = 0x200000; - num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); + num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512); ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0); if (ret < 0) {
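As a quick arithmetic check of the change above, with the 2 MB block_size set a few lines earlier (block_size / 512 = 4096 sectors per block):

    total_sectors = 4096 (image exactly one block long):
        old: (4096 + 4096) / 4096      = 2 BAT entries (one more than needed)
        new: DIV_ROUND_UP(4096, 4096)  = 1 BAT entry
    total_sectors = 6144 (one and a half blocks):
        old: (6144 + 4096) / 4096      = 2 BAT entries
        new: DIV_ROUND_UP(6144, 4096)  = 2 BAT entries

So the result only changes for image sizes that are already block-aligned, which is exactly the case where the old formula produced the oversized BAT that Azure rejects.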