From f871abd60f4b67547e62c57c9bec19420052be39 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 21 May 2019 19:00:25 +0200
Subject: [PATCH 01/29] block: Drain source node in bdrv_replace_node()

Instead of just asserting that no requests are in flight in
bdrv_replace_node(), which is a requirement that most callers ignore, we
can just drain the source node right there. This fixes at least starting
a commit job while I/O is active on the backing chain, but probably
other callers, too.

Having requests in flight on the target node isn't a problem because the
target just gets new parents, but the call path of running requests
isn't modified. So we can just drop this assertion without a replacement.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1711643
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 1a73e310c1..4c3902508d 100644
--- a/block.c
+++ b/block.c
@@ -4017,13 +4017,13 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
     uint64_t perm = 0, shared = BLK_PERM_ALL;
     int ret;
 
-    assert(!atomic_read(&from->in_flight));
-    assert(!atomic_read(&to->in_flight));
-
     /* Make sure that @from doesn't go away until we have successfully attached
      * all of its parents to @to. */
     bdrv_ref(from);
 
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    bdrv_drained_begin(from);
+
     /* Put all parents into @list and calculate their cumulative permissions */
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
         assert(c->bs == from);
@@ -4064,6 +4064,7 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
 
 out:
     g_slist_free(list);
+    bdrv_drained_end(from);
     bdrv_unref(from);
 }
 

From ac6fb43eae1f5029b51e0a3d975fe2111cc8b976 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 21 May 2019 20:35:52 +0200
Subject: [PATCH 02/29] iotests: Test commit job start with concurrent I/O

This tests that concurrent requests are correctly drained before making
graph modifications instead of running into assertions in
bdrv_replace_node().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/255        | 83 +++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/255.out    | 16 +++++++
 tests/qemu-iotests/group      |  1 +
 tests/qemu-iotests/iotests.py | 10 ++++-
 4 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100755 tests/qemu-iotests/255
 create mode 100644 tests/qemu-iotests/255.out

diff --git a/tests/qemu-iotests/255 b/tests/qemu-iotests/255
new file mode 100755
index 0000000000..c0bb37a9b0
--- /dev/null
+++ b/tests/qemu-iotests/255
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+#
+# Test commit job graph modifications while requests are active
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# Creator/Owner: Kevin Wolf <kwolf@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import iotests
+from iotests import imgfmt
+
+iotests.verify_image_format(supported_fmts=['qcow2'])
+
+def blockdev_create(vm, options):
+    result = vm.qmp_log('blockdev-create',
+                        filters=[iotests.filter_qmp_testfiles],
+                        job_id='job0', options=options)
+
+    if 'return' in result:
+        assert result['return'] == {}
+        vm.run_job('job0')
+    iotests.log("")
+
+with iotests.FilePath('t.qcow2') as disk_path, \
+     iotests.FilePath('t.qcow2.mid') as mid_path, \
+     iotests.FilePath('t.qcow2.base') as base_path, \
+     iotests.VM() as vm:
+
+    iotests.log("=== Create backing chain and start VM ===")
+    iotests.log("")
+
+    size = 128 * 1024 * 1024
+    size_str = str(size)
+
+    iotests.create_image(base_path, size)
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, mid_path, size_str)
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, disk_path, size_str)
+
+    # Create a backing chain like this:
+    # base <- [throttled: bps-read=4096] <- mid <- overlay
+
+    vm.add_object('throttle-group,x-bps-read=4096,id=throttle0')
+    vm.add_blockdev('file,filename=%s,node-name=base' % (base_path))
+    vm.add_blockdev('throttle,throttle-group=throttle0,file=base,node-name=throttled')
+    vm.add_blockdev('file,filename=%s,node-name=mid-file' % (mid_path))
+    vm.add_blockdev('qcow2,file=mid-file,node-name=mid,backing=throttled')
+    vm.add_drive_raw('if=none,id=overlay,driver=qcow2,file=%s,backing=mid' % (disk_path))
+
+    vm.launch()
+
+    iotests.log("=== Start background read requests ===")
+    iotests.log("")
+
+    def start_requests():
+        vm.hmp_qemu_io('overlay', 'aio_read 0 4k')
+        vm.hmp_qemu_io('overlay', 'aio_read 0 4k')
+
+    start_requests()
+
+    iotests.log("=== Run a commit job ===")
+    iotests.log("")
+
+    result = vm.qmp_log('block-commit', job_id='job0', auto_finalize=False,
+                        device='overlay', top_node='mid')
+
+    vm.run_job('job0', auto_finalize=False, pre_finalize=start_requests,
+                auto_dismiss=True)
+
+    vm.shutdown()
diff --git a/tests/qemu-iotests/255.out b/tests/qemu-iotests/255.out
new file mode 100644
index 0000000000..9a2d7cbb77
--- /dev/null
+++ b/tests/qemu-iotests/255.out
@@ -0,0 +1,16 @@
+=== Create backing chain and start VM ===
+
+Formatting 'TEST_DIR/PID-t.qcow2.mid', fmt=qcow2 size=134217728 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-t.qcow2', fmt=qcow2 size=134217728 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+=== Start background read requests ===
+
+=== Run a commit job ===
+
+{"execute": "block-commit", "arguments": {"auto-finalize": false, "device": "overlay", "job-id": "job0", "top-node": "mid"}}
+{"return": {}}
+{"execute": "job-finalize", "arguments": {"id": "job0"}}
+{"return": {}}
+{"data": {"id": "job0", "type": "commit"}, "event": "BLOCK_JOB_PENDING", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "job0", "len": 134217728, "offset": 134217728, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 859c4b5e9f..88049ad46c 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -265,3 +265,4 @@
 252 rw auto backing quick
 253 rw auto quick
 254 rw auto backing quick
+255 rw auto quick
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 7bde380d96..6bcddd8870 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -126,6 +126,11 @@ def qemu_img_pipe(*args):
         sys.stderr.write('qemu-img received signal %i: %s\n' % (-exitcode, ' '.join(qemu_img_args + list(args))))
     return subp.communicate()[0]
 
+def qemu_img_log(*args):
+    result = qemu_img_pipe(*args)
+    log(result, filters=[filter_testfiles])
+    return result
+
 def img_info_log(filename, filter_path=None, imgopts=False, extra_args=[]):
     args = [ 'info' ]
     if imgopts:
@@ -533,7 +538,8 @@ class VM(qtest.QEMUQtestMachine):
         return result
 
     # Returns None on success, and an error string on failure
-    def run_job(self, job, auto_finalize=True, auto_dismiss=False):
+    def run_job(self, job, auto_finalize=True, auto_dismiss=False,
+                pre_finalize=None):
         error = None
         while True:
             for ev in self.get_qmp_events_filtered(wait=True):
@@ -546,6 +552,8 @@ class VM(qtest.QEMUQtestMachine):
                                 error = j['error']
                                 log('Job failed: %s' % (j['error']))
                     elif status == 'pending' and not auto_finalize:
+                        if pre_finalize:
+                            pre_finalize()
                         self.qmp_log('job-finalize', id=job)
                     elif status == 'concluded' and not auto_dismiss:
                         self.qmp_log('job-dismiss', id=job)

From 4da26f138db06c9c6d7199d42bd3c2be552cb956 Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Mon, 13 May 2019 11:06:38 -0400
Subject: [PATCH 03/29] blockdev: fix missed target unref for drive-backup

If the bitmap can't be used for whatever reason, we skip putting down
the reference. Fix that.

In practice, this means that if you attempt to gracefully exit QEMU
after a backup command being rejected, bdrv_close_all will fail and
tell you some unpleasant things via assert().

Reported-by: aihua liang <aliang@redhat.com>
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1703916
Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 17c2d801d7..d88dc115f2 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3546,8 +3546,7 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn,
     if (set_backing_hd) {
         bdrv_set_backing_hd(target_bs, source, &local_err);
         if (local_err) {
-            bdrv_unref(target_bs);
-            goto out;
+            goto unref;
         }
     }
 
@@ -3555,11 +3554,10 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn,
         bmap = bdrv_find_dirty_bitmap(bs, backup->bitmap);
         if (!bmap) {
             error_setg(errp, "Bitmap '%s' could not be found", backup->bitmap);
-            bdrv_unref(target_bs);
-            goto out;
+            goto unref;
         }
         if (bdrv_dirty_bitmap_check(bmap, BDRV_BITMAP_DEFAULT, errp)) {
-            goto out;
+            goto unref;
         }
     }
     if (!backup->auto_finalize) {
@@ -3573,12 +3571,13 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn,
                             backup->sync, bmap, backup->compress,
                             backup->on_source_error, backup->on_target_error,
                             job_flags, NULL, NULL, txn, &local_err);
-    bdrv_unref(target_bs);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
-        goto out;
+        goto unref;
     }
 
+unref:
+    bdrv_unref(target_bs);
 out:
     aio_context_release(aio_context);
     return job;

From 52f2b8961409be834abaee5189bff2cc9e372851 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Mon, 8 Apr 2019 19:26:16 +0300
Subject: [PATCH 04/29] tests/perf: Test lseek influence on qcow2 block-status

Block layer may recursively check block_status in file child of qcow2,
if qcow2 driver returned DATA. There are several test cases to check
influence of lseek on block_status performance. To see real difference
run on tmpfs.

Tests originally created by Kevin, I just refactored and put them
together into one executable file with simple output.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/perf/block/qcow2/convert-blockstatus | 71 ++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100755 tests/perf/block/qcow2/convert-blockstatus

diff --git a/tests/perf/block/qcow2/convert-blockstatus b/tests/perf/block/qcow2/convert-blockstatus
new file mode 100755
index 0000000000..a1a3c1ef43
--- /dev/null
+++ b/tests/perf/block/qcow2/convert-blockstatus
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# Test lseek influence on qcow2 block-status
+#
+# Block layer may recursively check block_status in file child of qcow2, if
+# qcow2 driver returned DATA. There are several test cases to check influence
+# of lseek on block_status performance. To see real difference run on tmpfs.
+#
+# Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+#
+# Tests originally written by Kevin Wolf
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+if [ "$#" -lt 1 ]; then
+    echo "Usage: $0 SOURCE_FILE"
+    exit 1
+fi
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../.." >/dev/null 2>&1 && pwd )"
+QEMU_IMG="$ROOT_DIR/qemu-img"
+QEMU_IO="$ROOT_DIR/qemu-io"
+
+size=1G
+src="$1"
+
+# test-case plain
+
+(
+$QEMU_IMG create -f qcow2 "$src" $size
+for i in $(seq 16384 -1 0); do
+    echo "write $((i * 65536)) 64k"
+done | $QEMU_IO "$src"
+) > /dev/null
+
+echo -n "plain: "
+/usr/bin/time -f %e $QEMU_IMG convert -n "$src" null-co://
+
+# test-case forward
+
+(
+$QEMU_IMG create -f qcow2 "$src" $size
+for i in $(seq 0 2 16384); do
+    echo "write $((i * 65536)) 64k"
+done | $QEMU_IO "$src"
+for i in $(seq 1 2 16384); do
+    echo "write $((i * 65536)) 64k"
+done | $QEMU_IO "$src"
+) > /dev/null
+
+echo -n "forward: "
+/usr/bin/time -f %e $QEMU_IMG convert -n "$src" null-co://
+
+# test-case prealloc
+
+$QEMU_IMG create -f qcow2 -o preallocation=metadata "$src" $size > /dev/null
+
+echo -n "prealloc: "
+/usr/bin/time -f %e $QEMU_IMG convert -n "$src" null-co://

From 69f47505ee66afaa513305de0c1895a224e52c45 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Mon, 8 Apr 2019 19:26:17 +0300
Subject: [PATCH 05/29] block: avoid recursive block_status call if possible

drv_co_block_status digs bs->file for additional, more accurate search
for hole inside region, reported as DATA by bs since 5daa74a6ebc.

This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
knows, where are holes and where is data. But every block_status
request calls lseek additionally. Assume a big disk, full of
data, in any iterative copying block job (or img convert) we'll call
lseek(HOLE) on every iteration, and each of these lseeks will have to
iterate through all metadata up to the end of file. It's obviously
ineffective behavior. And for many scenarios we don't need this lseek
at all.

However, lseek is needed when we have metadata-preallocated image.

So, let's detect metadata-preallocation case and don't dig qcow2's
protocol file in other cases.

The idea is to compare allocation size in POV of filesystem with
allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
significantly lower, consider it as metadata-preallocation case.

102 iotest changed, as our detector can't detect shrinked file as
metadata-preallocation, which don't seem to be wrong, as with metadata
preallocation we always have valid file length.

Two other iotests have a slight change in their QMP output sequence:
Active 'block-commit' returns earlier because the job coroutine yields
earlier on a blocking operation. This operation is loading the refcount
blocks in qcow2_detect_metadata_preallocation().

Suggested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c                 |  9 ++++++++-
 block/qcow2-refcount.c     | 32 ++++++++++++++++++++++++++++++++
 block/qcow2.c              | 11 +++++++++++
 block/qcow2.h              |  4 ++++
 include/block/block.h      |  8 +++++++-
 tests/qemu-iotests/102     |  2 +-
 tests/qemu-iotests/102.out |  3 ++-
 tests/qemu-iotests/141.out |  2 +-
 tests/qemu-iotests/144.out |  2 +-
 9 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index 3134a60a48..150358c3b1 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2092,6 +2092,12 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
      */
     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
            align > offset - aligned_offset);
+    if (ret & BDRV_BLOCK_RECURSE) {
+        assert(ret & BDRV_BLOCK_DATA);
+        assert(ret & BDRV_BLOCK_OFFSET_VALID);
+        assert(!(ret & BDRV_BLOCK_ZERO));
+    }
+
     *pnum -= offset - aligned_offset;
     if (*pnum > bytes) {
         *pnum = bytes;
@@ -2122,7 +2128,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
         }
     }
 
-    if (want_zero && local_file && local_file != bs &&
+    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
+        local_file && local_file != bs &&
         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
         (ret & BDRV_BLOCK_OFFSET_VALID)) {
         int64_t file_pnum;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4c1794f9af..3a2c673a5e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -3444,3 +3444,35 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
                             "There are no references in the refcount table.");
     return -EIO;
 }
+
+int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t i, end_cluster, cluster_count = 0, threshold;
+    int64_t file_length, real_allocation, real_clusters;
+
+    file_length = bdrv_getlength(bs->file->bs);
+    if (file_length < 0) {
+        return file_length;
+    }
+
+    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
+    if (real_allocation < 0) {
+        return real_allocation;
+    }
+
+    real_clusters = real_allocation / s->cluster_size;
+    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
+
+    end_cluster = size_to_clusters(s, file_length);
+    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
+        uint64_t refcount;
+        int ret = qcow2_get_refcount(bs, i, &refcount);
+        if (ret < 0) {
+            return ret;
+        }
+        cluster_count += !!refcount;
+    }
+
+    return cluster_count >= threshold;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index f2cb131048..14f914117f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1895,6 +1895,12 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
     unsigned int bytes;
     int status = 0;
 
+    if (!s->metadata_preallocation_checked) {
+        ret = qcow2_detect_metadata_preallocation(bs);
+        s->metadata_preallocation = (ret == 1);
+        s->metadata_preallocation_checked = true;
+    }
+
     bytes = MIN(INT_MAX, count);
     qemu_co_mutex_lock(&s->lock);
     ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
@@ -1917,6 +1923,11 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
     } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
         status |= BDRV_BLOCK_DATA;
     }
+    if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) &&
+        (status & BDRV_BLOCK_OFFSET_VALID))
+    {
+        status |= BDRV_BLOCK_RECURSE;
+    }
     return status;
 }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 567375e56c..fc1b0d3c1e 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -356,6 +356,9 @@ typedef struct BDRVQcow2State {
     int nb_threads;
 
     BdrvChild *data_file;
+
+    bool metadata_preallocation_checked;
+    bool metadata_preallocation;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
@@ -655,6 +658,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                 void *cb_opaque, Error **errp);
 int qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
+int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
diff --git a/include/block/block.h b/include/block/block.h
index 9b083e2bca..531cf595cf 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -156,10 +156,15 @@ typedef struct HDGeometry {
  * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
  *                 layer, set by block layer
  *
- * Internal flag:
+ * Internal flags:
  * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
  *                 that the block layer recompute the answer from the returned
  *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
+ * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
+ *                     zeroes in file child of current block node inside
+ *                     returned region. Only valid together with both
+ *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
+ *                     appear with BDRV_BLOCK_ZERO.
  *
  * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
  * host offset within the returned BDS that is allocated for the
@@ -184,6 +189,7 @@ typedef struct HDGeometry {
 #define BDRV_BLOCK_RAW          0x08
 #define BDRV_BLOCK_ALLOCATED    0x10
 #define BDRV_BLOCK_EOF          0x20
+#define BDRV_BLOCK_RECURSE      0x40
 #define BDRV_BLOCK_OFFSET_MASK  BDRV_SECTOR_MASK
 
 typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
diff --git a/tests/qemu-iotests/102 b/tests/qemu-iotests/102
index 749ff66b8a..b898df436f 100755
--- a/tests/qemu-iotests/102
+++ b/tests/qemu-iotests/102
@@ -55,7 +55,7 @@ $QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io
 $QEMU_IMG resize -f raw --shrink "$TEST_IMG" $((5 * 64 * 1024))
 
 $QEMU_IO -c map "$TEST_IMG"
-$QEMU_IMG map "$TEST_IMG"
+$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map
 
 echo
 echo '=== Testing map on an image file truncated outside of qemu ==='
diff --git a/tests/qemu-iotests/102.out b/tests/qemu-iotests/102.out
index 4401b08fee..cd2fdc7f96 100644
--- a/tests/qemu-iotests/102.out
+++ b/tests/qemu-iotests/102.out
@@ -7,7 +7,8 @@ wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 Image resized.
 64 KiB (0x10000) bytes     allocated at offset 0 bytes (0x0)
-Offset          Length          Mapped to       File
+Offset          Length          File
+0               0x10000         TEST_DIR/t.IMGFMT
 
 === Testing map on an image file truncated outside of qemu ===
 
diff --git a/tests/qemu-iotests/141.out b/tests/qemu-iotests/141.out
index 41c7291258..4d71d9dcae 100644
--- a/tests/qemu-iotests/141.out
+++ b/tests/qemu-iotests/141.out
@@ -42,9 +42,9 @@ Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
+{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}
-{"return": {}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device is in use by block job: commit"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job0"}}
diff --git a/tests/qemu-iotests/144.out b/tests/qemu-iotests/144.out
index 55299201e4..a9a8216bea 100644
--- a/tests/qemu-iotests/144.out
+++ b/tests/qemu-iotests/144.out
@@ -14,10 +14,10 @@ Formatting 'TEST_DIR/tmp.qcow2', fmt=qcow2 size=536870912 backing_file=TEST_DIR/
 
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "virtio0"}}
+{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "virtio0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}
 {"return": {}}
-{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "virtio0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}

From 5cb2737e925042e6c7cd3fb0b01313950b03cddf Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 22 May 2019 16:40:36 +0200
Subject: [PATCH 06/29] block/io: Delay decrementing the quiesce_counter

When ending a drained section, bdrv_do_drained_end() currently first
decrements the quiesce_counter, and only then actually ends the drain.

The bdrv_drain_invoke(bs, false) call may cause graph changes.  Say the
graph change involves replacing an existing BB's ("blk") BDS
(blk_bs(blk)) by @bs.  Let us introducing the following values:
- bs_oqc = old_quiesce_counter
  (so bs->quiesce_counter == bs_oqc - 1)
- obs_qc = blk_bs(blk)->quiesce_counter (before bdrv_drain_invoke())

Let us assume there is no blk_pread_unthrottled() involved, so
blk->quiesce_counter == obs_qc (before bdrv_drain_invoke()).

Now replacing blk_bs(blk) by @bs will reduce blk->quiesce_counter by
obs_qc (making it 0) and increase it by bs_oqc-1 (making it bs_oqc-1).

bdrv_drain_invoke() returns and we invoke bdrv_parent_drained_end().
This will decrement blk->quiesce_counter by one, so it would be -1 --
were there not an assertion against that in blk_root_drained_end().

We therefore have to keep the quiesce_counter up at least until
bdrv_drain_invoke() returns, so that bdrv_parent_drained_end() does the
right thing for the parents @bs got during bdrv_drain_invoke().

But let us delay it even further, namely until bdrv_parent_drained_end()
returns, because then it mirrors bdrv_do_drained_begin(): There, we
first increment the quiesce_counter, then begin draining the parents,
and then call bdrv_drain_invoke().  It makes sense to let
bdrv_do_drained_end() unravel this exactly in reverse.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/io.c b/block/io.c
index 150358c3b1..0f6ebd001c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -422,11 +422,12 @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
         return;
     }
     assert(bs->quiesce_counter > 0);
-    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
     bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
+
+    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
     if (old_quiesce_counter == 1) {
         aio_enable_external(bdrv_get_aio_context(bs));
     }

From 74f27eadeae799c44d51ba53ffc94807be2b5834 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 22 May 2019 16:40:37 +0200
Subject: [PATCH 07/29] iotests: Test cancelling a job and closing the VM

This patch adds a test where we cancel a throttled mirror job and
immediately close the VM before it can be cancelled.  Doing so will
invoke bdrv_drain_all() while the mirror job tries to drain the
throttled node.  When bdrv_drain_all_end() tries to lift its drain on
the throttle node, the job will exit and replace the current root node
of the BB drive0 (which is the job's filter node) by the throttle node.
Before the previous patch, this replacement did not increase drive0's
quiesce_counter by a sufficient amount, so when
bdrv_parent_drained_end() (invoked by bdrv_do_drained_end(), invoked by
bdrv_drain_all_end()) tried to end the drain on all of the throttle
node's parents, it decreased drive0's quiesce_counter below 0 -- which
fails an assertion.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/255     | 52 ++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/255.out | 24 ++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/tests/qemu-iotests/255 b/tests/qemu-iotests/255
index c0bb37a9b0..49433ec122 100755
--- a/tests/qemu-iotests/255
+++ b/tests/qemu-iotests/255
@@ -35,6 +35,10 @@ def blockdev_create(vm, options):
         vm.run_job('job0')
     iotests.log("")
 
+iotests.log('Finishing a commit job with background reads')
+iotests.log('============================================')
+iotests.log('')
+
 with iotests.FilePath('t.qcow2') as disk_path, \
      iotests.FilePath('t.qcow2.mid') as mid_path, \
      iotests.FilePath('t.qcow2.base') as base_path, \
@@ -81,3 +85,51 @@ with iotests.FilePath('t.qcow2') as disk_path, \
                 auto_dismiss=True)
 
     vm.shutdown()
+
+iotests.log('')
+iotests.log('Closing the VM while a job is being cancelled')
+iotests.log('=============================================')
+iotests.log('')
+
+with iotests.FilePath('src.qcow2') as src_path, \
+     iotests.FilePath('dst.qcow2') as dst_path, \
+     iotests.VM() as vm:
+
+    iotests.log('=== Create images and start VM ===')
+    iotests.log('')
+
+    size = 128 * 1024 * 1024
+    size_str = str(size)
+
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, src_path, size_str)
+    iotests.qemu_img_log('create', '-f', iotests.imgfmt, dst_path, size_str)
+
+    iotests.log(iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write 0 1M',
+                                src_path),
+                filters=[iotests.filter_test_dir, iotests.filter_qemu_io])
+
+    vm.add_object('throttle-group,x-bps-read=4096,id=throttle0')
+
+    vm.add_blockdev('file,node-name=src-file,filename=%s' % (src_path))
+    vm.add_blockdev('%s,node-name=src,file=src-file' % (iotests.imgfmt))
+
+    vm.add_blockdev('file,node-name=dst-file,filename=%s' % (dst_path))
+    vm.add_blockdev('%s,node-name=dst,file=dst-file' % (iotests.imgfmt))
+
+    vm.add_blockdev('throttle,node-name=src-throttled,' +
+                    'throttle-group=throttle0,file=src')
+
+    vm.add_device('virtio-blk,drive=src-throttled')
+
+    vm.launch()
+
+    iotests.log('=== Start a mirror job ===')
+    iotests.log('')
+
+    vm.qmp_log('blockdev-mirror', job_id='job0', device='src-throttled',
+                                  target='dst', sync='full')
+
+    vm.qmp_log('block-job-cancel', device='job0')
+    vm.qmp_log('quit')
+
+    vm.shutdown()
diff --git a/tests/qemu-iotests/255.out b/tests/qemu-iotests/255.out
index 9a2d7cbb77..348909fdef 100644
--- a/tests/qemu-iotests/255.out
+++ b/tests/qemu-iotests/255.out
@@ -1,3 +1,6 @@
+Finishing a commit job with background reads
+============================================
+
 === Create backing chain and start VM ===
 
 Formatting 'TEST_DIR/PID-t.qcow2.mid', fmt=qcow2 size=134217728 cluster_size=65536 lazy_refcounts=off refcount_bits=16
@@ -14,3 +17,24 @@ Formatting 'TEST_DIR/PID-t.qcow2', fmt=qcow2 size=134217728 cluster_size=65536 l
 {"return": {}}
 {"data": {"id": "job0", "type": "commit"}, "event": "BLOCK_JOB_PENDING", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 {"data": {"device": "job0", "len": 134217728, "offset": 134217728, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+
+Closing the VM while a job is being cancelled
+=============================================
+
+=== Create images and start VM ===
+
+Formatting 'TEST_DIR/PID-src.qcow2', fmt=qcow2 size=134217728 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+Formatting 'TEST_DIR/PID-dst.qcow2', fmt=qcow2 size=134217728 cluster_size=65536 lazy_refcounts=off refcount_bits=16
+
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Start a mirror job ===
+
+{"execute": "blockdev-mirror", "arguments": {"device": "src-throttled", "job-id": "job0", "sync": "full", "target": "dst"}}
+{"return": {}}
+{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
+{"return": {}}
+{"execute": "quit", "arguments": {}}
+{"return": {}}

From 2b02fd81ded2f8fefbdacee431fc9fb006f92bc6 Mon Sep 17 00:00:00 2001
From: Julia Suvorova <jusual@mail.ru>
Date: Sun, 2 Jun 2019 23:17:09 +0300
Subject: [PATCH 08/29] block/linux-aio: Drop unused BlockAIOCB submission
 method

Callback-based laio_submit() and laio_cancel() were left after
rewriting Linux AIO backend to coroutines in hope that they would be
used in other code that could bypass coroutines. They can be safely
removed because they have not been used since that time.

Signed-off-by: Julia Suvorova <jusual@mail.ru>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/linux-aio.c       | 72 ++++++-----------------------------------
 include/block/raw-aio.h |  3 --
 2 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index d4b61fb251..27100c2fd1 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -30,7 +30,6 @@
 #define MAX_EVENTS 128
 
 struct qemu_laiocb {
-    BlockAIOCB common;
     Coroutine *co;
     LinuxAioState *ctx;
     struct iocb iocb;
@@ -72,7 +71,7 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 }
 
 /*
- * Completes an AIO request (calls the callback and frees the ACB).
+ * Completes an AIO request.
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
@@ -94,18 +93,15 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
-    if (laiocb->co) {
-        /* If the coroutine is already entered it must be in ioq_submit() and
-         * will notice laio->ret has been filled in when it eventually runs
-         * later.  Coroutines cannot be entered recursively so avoid doing
-         * that!
-         */
-        if (!qemu_coroutine_entered(laiocb->co)) {
-            aio_co_wake(laiocb->co);
-        }
-    } else {
-        laiocb->common.cb(laiocb->common.opaque, ret);
-        qemu_aio_unref(laiocb);
+
+    /*
+     * If the coroutine is already entered it must be in ioq_submit() and
+     * will notice laio->ret has been filled in when it eventually runs
+     * later.  Coroutines cannot be entered recursively so avoid doing
+     * that!
+     */
+    if (!qemu_coroutine_entered(laiocb->co)) {
+        aio_co_wake(laiocb->co);
     }
 }
 
@@ -273,30 +269,6 @@ static bool qemu_laio_poll_cb(void *opaque)
     return true;
 }
 
-static void laio_cancel(BlockAIOCB *blockacb)
-{
-    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
-    struct io_event event;
-    int ret;
-
-    if (laiocb->ret != -EINPROGRESS) {
-        return;
-    }
-    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
-    laiocb->ret = -ECANCELED;
-    if (ret != 0) {
-        /* iocb is not cancelled, cb will be called by the event loop later */
-        return;
-    }
-
-    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
-}
-
-static const AIOCBInfo laio_aiocb_info = {
-    .aiocb_size         = sizeof(struct qemu_laiocb),
-    .cancel_async       = laio_cancel,
-};
-
 static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
@@ -431,30 +403,6 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
     return laiocb.ret;
 }
 
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
-{
-    struct qemu_laiocb *laiocb;
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;
-    int ret;
-
-    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
-    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-    laiocb->ctx = s;
-    laiocb->ret = -EINPROGRESS;
-    laiocb->is_read = (type == QEMU_AIO_READ);
-    laiocb->qiov = qiov;
-
-    ret = laio_do_submit(fd, laiocb, offset, type);
-    if (ret < 0) {
-        qemu_aio_unref(laiocb);
-        return NULL;
-    }
-
-    return &laiocb->common;
-}
-
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index ba223dd1f1..0cb7cc74a2 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -50,9 +50,6 @@ LinuxAioState *laio_init(Error **errp);
 void laio_cleanup(LinuxAioState *s);
 int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                 uint64_t offset, QEMUIOVector *qiov, int type);
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type);
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
 void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);

From 3036a626e9efad8ee5dc178b149184cf87bd06b0 Mon Sep 17 00:00:00 2001
From: Kenneth Heitke <kenneth.heitke@intel.com>
Date: Mon, 20 May 2019 11:40:30 -0600
Subject: [PATCH 09/29] nvme: add Get/Set Feature Timestamp support

Signed-off-by: Kenneth Heitke <kenneth.heitke@intel.com>
Reviewed-by: Klaus Birkelund Jensen <klaus.jensen@cnexlabs.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/nvme.c       | 106 +++++++++++++++++++++++++++++++++++++++++-
 hw/block/nvme.h       |   2 +
 hw/block/trace-events |   2 +
 include/block/nvme.h  |   2 +
 4 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 63a5b58849..30e50f7a38 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -219,6 +219,30 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+                                   uint64_t prp1, uint64_t prp2)
+{
+    QEMUSGList qsg;
+    QEMUIOVector iov;
+    uint16_t status = NVME_SUCCESS;
+
+    if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (qsg.nsg > 0) {
+        if (dma_buf_write(ptr, len, &qsg)) {
+            status = NVME_INVALID_FIELD | NVME_DNR;
+        }
+        qemu_sglist_destroy(&qsg);
+    } else {
+        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
+            status = NVME_INVALID_FIELD | NVME_DNR;
+        }
+        qemu_iovec_destroy(&iov);
+    }
+    return status;
+}
+
 static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     uint64_t prp1, uint64_t prp2)
 {
@@ -678,7 +702,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     return ret;
 }
 
-
 static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
 {
     NvmeIdentify *c = (NvmeIdentify *)cmd;
@@ -696,6 +719,57 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     }
 }
 
+static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
+{
+    trace_nvme_setfeat_timestamp(ts);
+
+    n->host_timestamp = le64_to_cpu(ts);
+    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+}
+
+static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
+{
+    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
+
+    union nvme_timestamp {
+        struct {
+            uint64_t timestamp:48;
+            uint64_t sync:1;
+            uint64_t origin:3;
+            uint64_t rsvd1:12;
+        };
+        uint64_t all;
+    };
+
+    union nvme_timestamp ts;
+    ts.all = 0;
+
+    /*
+     * If the sum of the Timestamp value set by the host and the elapsed
+     * time exceeds 2^48, the value returned should be reduced modulo 2^48.
+     */
+    ts.timestamp = (n->host_timestamp + elapsed_time) & 0xffffffffffff;
+
+    /* If the host timestamp is non-zero, set the timestamp origin */
+    ts.origin = n->host_timestamp ? 0x01 : 0x00;
+
+    trace_nvme_getfeat_timestamp(ts.all);
+
+    return cpu_to_le64(ts.all);
+}
+
+static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+
+    uint64_t timestamp = nvme_get_timestamp(n);
+
+    return nvme_dma_read_prp(n, (uint8_t *)&timestamp,
+                                 sizeof(timestamp), prp1, prp2);
+}
+
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -710,6 +784,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         trace_nvme_getfeat_numq(result);
         break;
+    case NVME_TIMESTAMP:
+        return nvme_get_feature_timestamp(n, cmd);
+        break;
     default:
         trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -719,6 +796,24 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint16_t ret;
+    uint64_t timestamp;
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+
+    ret = nvme_dma_write_prp(n, (uint8_t *)&timestamp,
+                                sizeof(timestamp), prp1, prp2);
+    if (ret != NVME_SUCCESS) {
+        return ret;
+    }
+
+    nvme_set_timestamp(n, timestamp);
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -735,6 +830,11 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
+
+    case NVME_TIMESTAMP:
+        return nvme_set_feature_timestamp(n, cmd);
+        break;
+
     default:
         trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -907,6 +1007,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
     nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
         NVME_AQA_ASQS(n->bar.aqa) + 1);
 
+    nvme_set_timestamp(n, 0ULL);
+
     return 0;
 }
 
@@ -1270,7 +1372,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
     id->sqes = (0x6 << 4) | 0x6;
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
-    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
+    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 56c9d4b4b1..557194ee19 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -79,6 +79,8 @@ typedef struct NvmeCtrl {
     uint32_t    cmbloc;
     uint8_t     *cmbuf;
     uint64_t    irq_status;
+    uint64_t    host_timestamp;                 /* Timestamp sent by the host */
+    uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
 
     char            *serial;
     NvmeNamespace   *namespaces;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index b92039a573..97a17838ed 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -46,6 +46,8 @@ nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
 nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s"
 nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
 nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
+nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
 nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 849a6f3fa3..3ec8efcc43 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -581,6 +581,7 @@ enum NvmeIdCtrlOncs {
     NVME_ONCS_WRITE_ZEROS   = 1 << 3,
     NVME_ONCS_FEATURES      = 1 << 4,
     NVME_ONCS_RESRVATIONS   = 1 << 5,
+    NVME_ONCS_TIMESTAMP     = 1 << 6,
 };
 
 #define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
@@ -622,6 +623,7 @@ enum NvmeFeatureIds {
     NVME_INTERRUPT_VECTOR_CONF      = 0x9,
     NVME_WRITE_ATOMICITY            = 0xa,
     NVME_ASYNCHRONOUS_EVENT_CONF    = 0xb,
+    NVME_TIMESTAMP                  = 0xe,
     NVME_SOFTWARE_PROGRESS_MARKER   = 0x80
 };
 

From 087ba459a222cb022ce79110eebef190b8ec854a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 3 May 2019 13:20:54 +0200
Subject: [PATCH 10/29] test-block-iothread: Check filter node in
 test_propagate_mirror

Just make the test cover the AioContext of the filter node as well.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 tests/test-block-iothread.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index 59f692892e..e424d360c8 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -593,7 +593,7 @@ static void test_propagate_mirror(void)
     IOThread *iothread = iothread_new();
     AioContext *ctx = iothread_get_aio_context(iothread);
     AioContext *main_ctx = qemu_get_aio_context();
-    BlockDriverState *src, *target;
+    BlockDriverState *src, *target, *filter;
     BlockBackend *blk;
     Job *job;
     Error *local_err = NULL;
@@ -610,11 +610,13 @@ static void test_propagate_mirror(void)
                  false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
                  &error_abort);
     job = job_get("job0");
+    filter = bdrv_find_node("filter_node");
 
     /* Change the AioContext of src */
     bdrv_try_set_aio_context(src, ctx, &error_abort);
     g_assert(bdrv_get_aio_context(src) == ctx);
     g_assert(bdrv_get_aio_context(target) == ctx);
+    g_assert(bdrv_get_aio_context(filter) == ctx);
     g_assert(job->aio_context == ctx);
 
     /* Change the AioContext of target */
@@ -623,6 +625,7 @@ static void test_propagate_mirror(void)
     aio_context_release(ctx);
     g_assert(bdrv_get_aio_context(src) == main_ctx);
     g_assert(bdrv_get_aio_context(target) == main_ctx);
+    g_assert(bdrv_get_aio_context(filter) == main_ctx);
 
     /* With a BlockBackend on src, changing target must fail */
     blk = blk_new(0, BLK_PERM_ALL);
@@ -635,6 +638,7 @@ static void test_propagate_mirror(void)
     g_assert(blk_get_aio_context(blk) == main_ctx);
     g_assert(bdrv_get_aio_context(src) == main_ctx);
     g_assert(bdrv_get_aio_context(target) == main_ctx);
+    g_assert(bdrv_get_aio_context(filter) == main_ctx);
 
     /* ...unless we explicitly allow it */
     aio_context_acquire(ctx);
@@ -645,6 +649,7 @@ static void test_propagate_mirror(void)
     g_assert(blk_get_aio_context(blk) == ctx);
     g_assert(bdrv_get_aio_context(src) == ctx);
     g_assert(bdrv_get_aio_context(target) == ctx);
+    g_assert(bdrv_get_aio_context(filter) == ctx);
 
     job_cancel_sync_all();
 

From 45e92a9011bff4f6660dae1e011083c5790d121b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 24 May 2019 10:41:34 +0200
Subject: [PATCH 11/29] nbd-server: Call blk_set_allow_aio_context_change()

The NBD server uses an AioContext notifier, so it can tolerate that its
BlockBackend is switched to a different AioContext. Before we start
actually calling bdrv_try_set_aio_context(), which checks for
consistency, outside of test cases, we need to make sure that the NBD
server actually allows this.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 nbd/server.c               |  1 +
 tests/qemu-iotests/240     | 21 +++++++++++++++++++++
 tests/qemu-iotests/240.out | 13 +++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/nbd/server.c b/nbd/server.c
index e21bd501dc..d1375350bc 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1491,6 +1491,7 @@ NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset,
         goto fail;
     }
     blk_set_enable_write_cache(blk, !writethrough);
+    blk_set_allow_aio_context_change(blk, true);
 
     exp->refcount = 1;
     QTAILQ_INIT(&exp->clients);
diff --git a/tests/qemu-iotests/240 b/tests/qemu-iotests/240
index b4cf95096d..5be6b9c0f7 100755
--- a/tests/qemu-iotests/240
+++ b/tests/qemu-iotests/240
@@ -27,6 +27,12 @@ echo "QA output created by $seq"
 
 status=1	# failure is the default!
 
+_cleanup()
+{
+    rm -f "$TEST_DIR/nbd"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
 # get standard environment, filters and checks
 . ./common.rc
 . ./common.filter
@@ -122,6 +128,21 @@ run_qemu <<EOF
 { "execute": "quit"}
 EOF
 
+echo
+echo === Attach a SCSI disks using the same block device as a NBD server ===
+echo
+
+run_qemu <<EOF
+{ "execute": "qmp_capabilities" }
+{ "execute": "blockdev-add", "arguments": {"driver": "null-co", "node-name": "hd0", "read-only": true}}
+{ "execute": "nbd-server-start", "arguments": {"addr":{"type":"unix","data":{"path":"$TEST_DIR/nbd"}}}}
+{ "execute": "nbd-server-add", "arguments": {"device":"hd0"}}
+{ "execute": "object-add", "arguments": {"qom-type": "iothread", "id": "iothread0"}}
+{ "execute": "device_add", "arguments": {"id": "scsi0", "driver": "${virtio_scsi}", "iothread": "iothread0"}}
+{ "execute": "device_add", "arguments": {"id": "scsi-hd0", "driver": "scsi-hd", "drive": "hd0", "bus": "scsi0.0"}}
+{ "execute": "quit"}
+EOF
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/240.out b/tests/qemu-iotests/240.out
index d76392966c..84e0a43ce5 100644
--- a/tests/qemu-iotests/240.out
+++ b/tests/qemu-iotests/240.out
@@ -51,4 +51,17 @@ QMP_VERSION
 {"return": {}}
 {"return": {}}
 {"return": {}}
+
+=== Attach a SCSI disks using the same block device as a NBD server ===
+
+Testing:
+QMP_VERSION
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
 *** done

From 97896a4887a0a29c3314c5f0e9a82e269a6401fc Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 2 May 2019 11:10:59 +0200
Subject: [PATCH 12/29] block: Add Error to blk_set_aio_context()

Add an Error parameter to blk_set_aio_context() and use
bdrv_child_try_set_aio_context() internally to check whether all
involved nodes can actually support the AioContext switch.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c           | 26 ++++++++++++++++----------
 hw/block/dataplane/virtio-blk.c | 12 +++++++++---
 hw/block/dataplane/xen-block.c  |  6 ++++--
 hw/scsi/virtio-scsi.c           | 10 +++++++---
 include/sysemu/block-backend.h  |  3 ++-
 tests/test-bdrv-drain.c         |  8 ++++----
 tests/test-block-iothread.c     | 22 +++++++++++-----------
 7 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index ad3e1c882d..390fde6f71 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1865,30 +1865,36 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
     return blk_get_aio_context(blk_acb->blk);
 }
 
-static void blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
-                                   bool update_root_node)
+static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
+                                  bool update_root_node, Error **errp)
 {
     BlockDriverState *bs = blk_bs(blk);
     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
+    int ret;
 
     if (bs) {
+        if (update_root_node) {
+            ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
+                                                 errp);
+            if (ret < 0) {
+                return ret;
+            }
+        }
         if (tgm->throttle_state) {
             bdrv_drained_begin(bs);
             throttle_group_detach_aio_context(tgm);
             throttle_group_attach_aio_context(tgm, new_context);
             bdrv_drained_end(bs);
         }
-        if (update_root_node) {
-            GSList *ignore = g_slist_prepend(NULL, blk->root);
-            bdrv_set_aio_context_ignore(bs, new_context, &ignore);
-            g_slist_free(ignore);
-        }
     }
+
+    return 0;
 }
 
-void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
+int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
+                        Error **errp)
 {
-    blk_do_set_aio_context(blk, new_context, true);
+    return blk_do_set_aio_context(blk, new_context, true, errp);
 }
 
 static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
@@ -1915,7 +1921,7 @@ static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                  GSList **ignore)
 {
     BlockBackend *blk = child->opaque;
-    blk_do_set_aio_context(blk, ctx, false);
+    blk_do_set_aio_context(blk, ctx, false, &error_abort);
 }
 
 void blk_add_aio_context_notifier(BlockBackend *blk,
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 8c37bd314a..158c78f852 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -173,6 +173,7 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     unsigned i;
     unsigned nvqs = s->conf->num_queues;
+    Error *local_err = NULL;
     int r;
 
     if (vblk->dataplane_started || s->starting) {
@@ -212,7 +213,11 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
     vblk->dataplane_started = true;
     trace_virtio_blk_data_plane_start(s);
 
-    blk_set_aio_context(s->conf->conf.blk, s->ctx);
+    r = blk_set_aio_context(s->conf->conf.blk, s->ctx, &local_err);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto fail_guest_notifiers;
+    }
 
     /* Kick right away to begin processing requests already in vring */
     for (i = 0; i < nvqs; i++) {
@@ -281,8 +286,9 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
     aio_context_acquire(s->ctx);
     aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
 
-    /* Drain and switch bs back to the QEMU main loop */
-    blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context());
+    /* Drain and try to switch bs back to the QEMU main loop. If other users
+     * keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
 
     aio_context_release(s->ctx);
 
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index bb8f1186e4..f7ad452bbd 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -682,7 +682,8 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
     }
 
     aio_context_acquire(dataplane->ctx);
-    blk_set_aio_context(dataplane->blk, qemu_get_aio_context());
+    /* Xen doesn't have multiple users for nodes, so this can't fail */
+    blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
     aio_context_release(dataplane->ctx);
 
     xendev = dataplane->xendev;
@@ -811,7 +812,8 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
     }
 
     aio_context_acquire(dataplane->ctx);
-    blk_set_aio_context(dataplane->blk, dataplane->ctx);
+    /* If other users keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
     aio_context_release(dataplane->ctx);
     return;
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 839f120256..01c2b85f90 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -795,6 +795,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
     VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
     VirtIOSCSI *s = VIRTIO_SCSI(vdev);
     SCSIDevice *sd = SCSI_DEVICE(dev);
+    int ret;
 
     if (s->ctx && !s->dataplane_fenced) {
         AioContext *ctx;
@@ -808,9 +809,11 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
             return;
         }
         virtio_scsi_acquire(s);
-        blk_set_aio_context(sd->conf.blk, s->ctx);
+        ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
         virtio_scsi_release(s);
-
+        if (ret < 0) {
+            return;
+        }
     }
 
     if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
@@ -839,7 +842,8 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
 
     if (s->ctx) {
         virtio_scsi_acquire(s);
-        blk_set_aio_context(sd->conf.blk, qemu_get_aio_context());
+        /* If other users keep the BlockBackend in the iothread, that's ok */
+        blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
         virtio_scsi_release(s);
     }
 
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 938de34fe9..228fb3fb83 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -208,7 +208,8 @@ void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason);
 void blk_op_block_all(BlockBackend *blk, Error *reason);
 void blk_op_unblock_all(BlockBackend *blk, Error *reason);
 AioContext *blk_get_aio_context(BlockBackend *blk);
-void blk_set_aio_context(BlockBackend *blk, AioContext *new_context);
+int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
+                        Error **errp);
 void blk_add_aio_context_notifier(BlockBackend *blk,
         void (*attached_aio_context)(AioContext *new_context, void *opaque),
         void (*detach_aio_context)(void *opaque), void *opaque);
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 5534c2adf9..e86798923f 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -678,7 +678,7 @@ static void test_iothread_common(enum drain_type drain_type, int drain_thread)
     s = bs->opaque;
     blk_insert_bs(blk, bs, &error_abort);
 
-    blk_set_aio_context(blk, ctx_a);
+    blk_set_aio_context(blk, ctx_a, &error_abort);
     aio_context_acquire(ctx_a);
 
     s->bh_indirection_ctx = ctx_b;
@@ -742,7 +742,7 @@ static void test_iothread_common(enum drain_type drain_type, int drain_thread)
     }
 
     aio_context_acquire(ctx_a);
-    blk_set_aio_context(blk, qemu_get_aio_context());
+    blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
     aio_context_release(ctx_a);
 
     bdrv_unref(bs);
@@ -903,7 +903,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     if (use_iothread) {
         iothread = iothread_new();
         ctx = iothread_get_aio_context(iothread);
-        blk_set_aio_context(blk_src, ctx);
+        blk_set_aio_context(blk_src, ctx, &error_abort);
     } else {
         ctx = qemu_get_aio_context();
     }
@@ -1001,7 +1001,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO));
 
     if (use_iothread) {
-        blk_set_aio_context(blk_src, qemu_get_aio_context());
+        blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
     }
     aio_context_release(ctx);
 
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index e424d360c8..1d47ea9895 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -342,14 +342,14 @@ static void test_sync_op(const void *opaque)
     blk_insert_bs(blk, bs, &error_abort);
     c = QLIST_FIRST(&bs->parents);
 
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
     aio_context_acquire(ctx);
     t->fn(c);
     if (t->blkfn) {
         t->blkfn(blk);
     }
     aio_context_release(ctx);
-    blk_set_aio_context(blk, qemu_get_aio_context());
+    blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
 
     bdrv_unref(bs);
     blk_unref(blk);
@@ -428,7 +428,7 @@ static void test_attach_blockjob(void)
         aio_poll(qemu_get_aio_context(), false);
     }
 
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
 
     tjob->n = 0;
     while (tjob->n == 0) {
@@ -436,7 +436,7 @@ static void test_attach_blockjob(void)
     }
 
     aio_context_acquire(ctx);
-    blk_set_aio_context(blk, qemu_get_aio_context());
+    blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
     aio_context_release(ctx);
 
     tjob->n = 0;
@@ -444,7 +444,7 @@ static void test_attach_blockjob(void)
         aio_poll(qemu_get_aio_context(), false);
     }
 
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
 
     tjob->n = 0;
     while (tjob->n == 0) {
@@ -453,7 +453,7 @@ static void test_attach_blockjob(void)
 
     aio_context_acquire(ctx);
     job_complete_sync(&tjob->common.job, &error_abort);
-    blk_set_aio_context(blk, qemu_get_aio_context());
+    blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
     aio_context_release(ctx);
 
     bdrv_unref(bs);
@@ -497,7 +497,7 @@ static void test_propagate_basic(void)
     bs_verify = bdrv_open(NULL, NULL, options, BDRV_O_RDWR, &error_abort);
 
     /* Switch the AioContext */
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
     g_assert(blk_get_aio_context(blk) == ctx);
     g_assert(bdrv_get_aio_context(bs_a) == ctx);
     g_assert(bdrv_get_aio_context(bs_verify) == ctx);
@@ -505,7 +505,7 @@ static void test_propagate_basic(void)
 
     /* Switch the AioContext back */
     ctx = qemu_get_aio_context();
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
     g_assert(blk_get_aio_context(blk) == ctx);
     g_assert(bdrv_get_aio_context(bs_a) == ctx);
     g_assert(bdrv_get_aio_context(bs_verify) == ctx);
@@ -565,7 +565,7 @@ static void test_propagate_diamond(void)
     blk_insert_bs(blk, bs_verify, &error_abort);
 
     /* Switch the AioContext */
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
     g_assert(blk_get_aio_context(blk) == ctx);
     g_assert(bdrv_get_aio_context(bs_verify) == ctx);
     g_assert(bdrv_get_aio_context(bs_a) == ctx);
@@ -574,7 +574,7 @@ static void test_propagate_diamond(void)
 
     /* Switch the AioContext back */
     ctx = qemu_get_aio_context();
-    blk_set_aio_context(blk, ctx);
+    blk_set_aio_context(blk, ctx, &error_abort);
     g_assert(blk_get_aio_context(blk) == ctx);
     g_assert(bdrv_get_aio_context(bs_verify) == ctx);
     g_assert(bdrv_get_aio_context(bs_a) == ctx);
@@ -654,7 +654,7 @@ static void test_propagate_mirror(void)
     job_cancel_sync_all();
 
     aio_context_acquire(ctx);
-    blk_set_aio_context(blk, main_ctx);
+    blk_set_aio_context(blk, main_ctx, &error_abort);
     bdrv_try_set_aio_context(target, main_ctx, &error_abort);
     aio_context_release(ctx);
 

From d861ab3acf8dcf817e0c2335979b258847b69564 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 25 Apr 2019 14:25:10 +0200
Subject: [PATCH 13/29] block: Add BlockBackend.ctx

This adds a new parameter to blk_new() which requires its callers to
declare from which AioContext this BlockBackend is going to be used (or
the locks of which AioContext need to be taken anyway).

The given context is only stored and kept up to date when changing
AioContexts. Actually applying the stored AioContext to the root node
is saved for another commit.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                          |  2 +-
 block/backup.c                   |  3 ++-
 block/block-backend.c            | 18 +++++++++++++++---
 block/commit.c                   | 11 +++++++----
 block/crypto.c                   |  3 ++-
 block/mirror.c                   |  3 ++-
 block/parallels.c                |  3 ++-
 block/qcow.c                     |  3 ++-
 block/qcow2.c                    |  6 ++++--
 block/qed.c                      |  3 ++-
 block/sheepdog.c                 |  3 ++-
 block/vdi.c                      |  3 ++-
 block/vhdx.c                     |  3 ++-
 block/vmdk.c                     |  3 ++-
 block/vpc.c                      |  3 ++-
 blockdev.c                       |  4 ++--
 blockjob.c                       |  2 +-
 hmp.c                            |  3 ++-
 hw/block/fdc.c                   |  2 +-
 hw/block/xen-block.c             |  2 +-
 hw/core/qdev-properties-system.c |  4 +++-
 hw/ide/qdev.c                    |  2 +-
 hw/scsi/scsi-disk.c              |  2 +-
 include/sysemu/block-backend.h   |  2 +-
 migration/block.c                |  3 ++-
 nbd/server.c                     |  5 +++--
 qemu-img.c                       |  6 ++++--
 tests/test-bdrv-drain.c          | 30 +++++++++++++++---------------
 tests/test-bdrv-graph-mod.c      |  5 +++--
 tests/test-block-backend.c       |  6 ++++--
 tests/test-block-iothread.c      | 10 +++++-----
 tests/test-blockjob.c            |  2 +-
 tests/test-throttle.c            |  6 +++---
 33 files changed, 102 insertions(+), 64 deletions(-)

diff --git a/block.c b/block.c
index 4c3902508d..69ceac6377 100644
--- a/block.c
+++ b/block.c
@@ -2884,7 +2884,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
             /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
              * looking at the header to guess the image format. This works even
              * in cases where a guest would not see a consistent state. */
-            file = blk_new(0, BLK_PERM_ALL);
+            file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
             blk_insert_bs(file, file_bs, &local_err);
             bdrv_unref(file_bs);
             if (local_err) {
diff --git a/block/backup.c b/block/backup.c
index 00f4f8af53..715e1d3be8 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -627,7 +627,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
     }
 
     /* The target must match the source in size, so no resize here either */
-    job->target = blk_new(BLK_PERM_WRITE,
+    job->target = blk_new(job->common.job.aio_context,
+                          BLK_PERM_WRITE,
                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
     ret = blk_insert_bs(job->target, target, errp);
diff --git a/block/block-backend.c b/block/block-backend.c
index 390fde6f71..f03f14acb0 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -42,6 +42,7 @@ struct BlockBackend {
     char *name;
     int refcnt;
     BdrvChild *root;
+    AioContext *ctx;
     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
@@ -322,12 +323,13 @@ static const BdrvChildRole child_root = {
  *
  * Return the new BlockBackend on success, null on failure.
  */
-BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
+BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 {
     BlockBackend *blk;
 
     blk = g_new0(BlockBackend, 1);
     blk->refcnt = 1;
+    blk->ctx = ctx;
     blk->perm = perm;
     blk->shared_perm = shared_perm;
     blk_set_enable_write_cache(blk, true);
@@ -347,6 +349,7 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 
 /*
  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
+ * The new BlockBackend is in the main AioContext.
  *
  * Just as with bdrv_open(), after having called this function the reference to
  * @options belongs to the block layer (even on failure).
@@ -382,7 +385,7 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
         perm |= BLK_PERM_RESIZE;
     }
 
-    blk = blk_new(perm, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), perm, BLK_PERM_ALL);
     bs = bdrv_open(filename, reference, options, flags, errp);
     if (!bs) {
         blk_unref(blk);
@@ -1856,7 +1859,15 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
 
 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
-    return bdrv_get_aio_context(blk_bs(blk));
+    BlockDriverState *bs = blk_bs(blk);
+
+    /* FIXME The AioContext of bs and blk can be inconsistent. For the moment,
+     * we prefer the one of bs for compatibility. */
+    if (bs) {
+        return bdrv_get_aio_context(blk_bs(blk));
+    }
+
+    return blk->ctx;
 }
 
 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
@@ -1888,6 +1899,7 @@ static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
         }
     }
 
+    blk->ctx = new_context;
     return 0;
 }
 
diff --git a/block/commit.c b/block/commit.c
index 14e5bb394c..4d519506d6 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -338,7 +338,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
         goto fail;
     }
 
-    s->base = blk_new(BLK_PERM_CONSISTENT_READ
+    s->base = blk_new(s->common.job.aio_context,
+                      BLK_PERM_CONSISTENT_READ
                       | BLK_PERM_WRITE
                       | BLK_PERM_RESIZE,
                       BLK_PERM_CONSISTENT_READ
@@ -351,7 +352,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     s->base_bs = base;
 
     /* Required permissions are already taken with block_job_add_bdrv() */
-    s->top = blk_new(0, BLK_PERM_ALL);
+    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
     ret = blk_insert_bs(s->top, top, errp);
     if (ret < 0) {
         goto fail;
@@ -395,6 +396,7 @@ int bdrv_commit(BlockDriverState *bs)
     BlockDriverState *backing_file_bs = NULL;
     BlockDriverState *commit_top_bs = NULL;
     BlockDriver *drv = bs->drv;
+    AioContext *ctx;
     int64_t offset, length, backing_length;
     int ro;
     int64_t n;
@@ -422,8 +424,9 @@ int bdrv_commit(BlockDriverState *bs)
         }
     }
 
-    src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
-    backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    ctx = bdrv_get_aio_context(bs);
+    src = blk_new(ctx, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
+    backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
 
     ret = blk_insert_bs(src, bs, &local_err);
     if (ret < 0) {
diff --git a/block/crypto.c b/block/crypto.c
index 3af46b805f..7351fd479d 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -257,7 +257,8 @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
     QCryptoBlock *crypto = NULL;
     struct BlockCryptoCreateData data;
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
 
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
diff --git a/block/mirror.c b/block/mirror.c
index ec4bd9f404..eb96b52de9 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1584,7 +1584,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
      * We can allow anything except resize there.*/
     target_is_backing = bdrv_chain_contains(bs, target);
     target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
-    s->target = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE |
+    s->target = blk_new(s->common.job.aio_context,
+                        BLK_PERM_WRITE | BLK_PERM_RESIZE |
                         (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0),
                         BLK_PERM_WRITE_UNCHANGED |
                         (target_is_backing ? BLK_PERM_CONSISTENT_READ |
diff --git a/block/parallels.c b/block/parallels.c
index 2747400577..00fae125d1 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -554,7 +554,8 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,
         return -EIO;
     }
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto out;
diff --git a/block/qcow.c b/block/qcow.c
index 1bb8fd05e2..6dee5bb792 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -844,7 +844,8 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
         return -EIO;
     }
 
-    qcow_blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    qcow_blk = blk_new(bdrv_get_aio_context(bs),
+                       BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(qcow_blk, bs, errp);
     if (ret < 0) {
         goto exit;
diff --git a/block/qcow2.c b/block/qcow2.c
index 14f914117f..9396d490d5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3174,7 +3174,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     }
 
     /* Create BlockBackend to write to the image */
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto out;
@@ -5006,7 +5007,8 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
     }
 
     if (new_size) {
-        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
+        BlockBackend *blk = blk_new(bdrv_get_aio_context(bs),
+                                    BLK_PERM_RESIZE, BLK_PERM_ALL);
         ret = blk_insert_bs(blk, bs, errp);
         if (ret < 0) {
             blk_unref(blk);
diff --git a/block/qed.c b/block/qed.c
index dcdcd62b4a..bb4f5c9863 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -649,7 +649,8 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
         return -EIO;
     }
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto out;
diff --git a/block/sheepdog.c b/block/sheepdog.c
index cbdfe9ab6e..f76d6ddbbc 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1800,7 +1800,8 @@ static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
     void *buf = NULL;
     int ret;
 
-    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
                   BLK_PERM_ALL);
 
     ret = blk_insert_bs(blk, bs, errp);
diff --git a/block/vdi.c b/block/vdi.c
index d7ef6628e7..b9845a4cbd 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -803,7 +803,8 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
         goto exit;
     }
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs_file),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs_file, errp);
     if (ret < 0) {
         goto exit;
diff --git a/block/vhdx.c b/block/vhdx.c
index a143a57657..d6070b6fa8 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1900,7 +1900,8 @@ static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,
         return -EIO;
     }
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto delete_and_exit;
diff --git a/block/vmdk.c b/block/vmdk.c
index de8cb859f8..51067c774f 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2356,7 +2356,8 @@ static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
     if (!bs) {
         return NULL;
     }
-    blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
                   BLK_PERM_ALL);
     if (blk_insert_bs(blk, bs, errp)) {
         bdrv_unref(bs);
diff --git a/block/vpc.c b/block/vpc.c
index 0c279b87c8..d4776ee8a5 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -1011,7 +1011,8 @@ static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
         return -EIO;
     }
 
-    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto out;
diff --git a/blockdev.c b/blockdev.c
index d88dc115f2..e1ad92c559 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -574,7 +574,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
     if ((!file || !*file) && !qdict_size(bs_opts)) {
         BlockBackendRootState *blk_rs;
 
-        blk = blk_new(0, BLK_PERM_ALL);
+        blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
         blk_rs = blk_get_root_state(blk);
         blk_rs->open_flags    = bdrv_flags;
         blk_rs->read_only     = read_only;
@@ -3154,7 +3154,7 @@ void qmp_block_resize(bool has_device, const char *device,
         goto out;
     }
 
-    blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
+    blk = blk_new(bdrv_get_aio_context(bs), BLK_PERM_RESIZE, BLK_PERM_ALL);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto out;
diff --git a/blockjob.c b/blockjob.c
index cc5f18e7cd..29517ae162 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -392,7 +392,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
         job_id = bdrv_get_device_name(bs);
     }
 
-    blk = blk_new(perm, shared_perm);
+    blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         blk_unref(blk);
diff --git a/hmp.c b/hmp.c
index 56a3ed7375..be5e345c6f 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2560,7 +2560,8 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
     if (!blk) {
         BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err);
         if (bs) {
-            blk = local_blk = blk_new(0, BLK_PERM_ALL);
+            blk = local_blk = blk_new(bdrv_get_aio_context(bs),
+                                      0, BLK_PERM_ALL);
             ret = blk_insert_bs(blk, bs, &err);
             if (ret < 0) {
                 goto fail;
diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index 6f19f127a5..37ccedc9f7 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -538,7 +538,7 @@ static void floppy_drive_realize(DeviceState *qdev, Error **errp)
 
     if (!dev->conf.blk) {
         /* Anonymous BlockBackend for an empty drive */
-        dev->conf.blk = blk_new(0, BLK_PERM_ALL);
+        dev->conf.blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
         ret = blk_attach_dev(dev->conf.blk, qdev);
         assert(ret == 0);
     }
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index ef635be4c2..31b0f5ccc8 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -609,7 +609,7 @@ static void xen_cdrom_realize(XenBlockDevice *blockdev, Error **errp)
         int rc;
 
         /* Set up an empty drive */
-        conf->blk = blk_new(0, BLK_PERM_ALL);
+        conf->blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
 
         rc = blk_attach_dev(conf->blk, DEVICE(blockdev));
         if (!rc) {
diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
index b45a7ef54b..42e048f190 100644
--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -80,7 +80,9 @@ static void parse_drive(DeviceState *dev, const char *str, void **ptr,
     if (!blk) {
         BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL);
         if (bs) {
-            blk = blk_new(0, BLK_PERM_ALL);
+            /* BlockBackends of devices start in the main context and are only
+             * later moved into another context if the device supports that. */
+            blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
             blk_created = true;
 
             ret = blk_insert_bs(blk, bs, errp);
diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index 573b022e1e..360cd20bd8 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -168,7 +168,7 @@ static void ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind, Error **errp)
             return;
         } else {
             /* Anonymous BlockBackend for an empty drive */
-            dev->conf.blk = blk_new(0, BLK_PERM_ALL);
+            dev->conf.blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
             ret = blk_attach_dev(dev->conf.blk, &dev->qdev);
             assert(ret == 0);
         }
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index e7e865ab3b..91c5a8b1ac 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2417,7 +2417,7 @@ static void scsi_cd_realize(SCSIDevice *dev, Error **errp)
     if (!dev->conf.blk) {
         /* Anonymous BlockBackend for an empty drive. As we put it into
          * dev->conf, qdev takes care of detaching on unplug. */
-        dev->conf.blk = blk_new(0, BLK_PERM_ALL);
+        dev->conf.blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
         ret = blk_attach_dev(dev->conf.blk, &dev->qdev);
         assert(ret == 0);
     }
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 228fb3fb83..733c4957eb 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -76,7 +76,7 @@ typedef struct BlockBackendPublic {
     ThrottleGroupMember throttle_group_member;
 } BlockBackendPublic;
 
-BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm);
+BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm);
 BlockBackend *blk_new_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp);
 int blk_get_refcnt(BlockBackend *blk);
diff --git a/migration/block.c b/migration/block.c
index 83c633fb3f..91f98ef44a 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -417,7 +417,8 @@ static int init_blk_migration(QEMUFile *f)
         }
 
         bmds = g_new0(BlkMigDevState, 1);
-        bmds->blk = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
+        bmds->blk = blk_new(qemu_get_aio_context(),
+                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
         bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
         bmds->bulk_completed = 0;
         bmds->total_sectors = sectors;
diff --git a/nbd/server.c b/nbd/server.c
index d1375350bc..aeca3893fe 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1484,8 +1484,9 @@ NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset,
     if ((nbdflags & NBD_FLAG_READ_ONLY) == 0) {
         perm |= BLK_PERM_WRITE;
     }
-    blk = blk_new(perm, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
-                        BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
+    blk = blk_new(bdrv_get_aio_context(bs), perm,
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
     ret = blk_insert_bs(blk, bs, errp);
     if (ret < 0) {
         goto fail;
diff --git a/qemu-img.c b/qemu-img.c
index b0535919b1..07b6e2a808 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3313,7 +3313,8 @@ static int img_rebase(int argc, char **argv)
         BlockDriverState *base_bs = backing_bs(bs);
 
         if (base_bs) {
-            blk_old_backing = blk_new(BLK_PERM_CONSISTENT_READ,
+            blk_old_backing = blk_new(qemu_get_aio_context(),
+                                      BLK_PERM_CONSISTENT_READ,
                                       BLK_PERM_ALL);
             ret = blk_insert_bs(blk_old_backing, base_bs,
                                 &local_err);
@@ -3360,7 +3361,8 @@ static int img_rebase(int argc, char **argv)
             prefix_chain_bs = bdrv_find_backing_image(bs, out_real_path);
             if (prefix_chain_bs) {
                 g_free(out_real_path);
-                blk_new_backing = blk_new(BLK_PERM_CONSISTENT_READ,
+                blk_new_backing = blk_new(qemu_get_aio_context(),
+                                          BLK_PERM_CONSISTENT_READ,
                                           BLK_PERM_ALL);
                 ret = blk_insert_bs(blk_new_backing, prefix_chain_bs,
                                     &local_err);
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index e86798923f..fabbd52d93 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -206,7 +206,7 @@ static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
 
     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
                               &error_abort);
     s = bs->opaque;
@@ -290,7 +290,7 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
     BlockBackend *blk;
     BlockDriverState *bs, *backing;
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
                               &error_abort);
     blk_insert_bs(blk, bs, &error_abort);
@@ -353,7 +353,7 @@ static void test_nested(void)
     BDRVTestState *s, *backing_s;
     enum drain_type outer, inner;
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
                               &error_abort);
     s = bs->opaque;
@@ -402,13 +402,13 @@ static void test_multiparent(void)
     BlockDriverState *bs_a, *bs_b, *backing;
     BDRVTestState *a_s, *b_s, *backing_s;
 
-    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
                                 &error_abort);
     a_s = bs_a->opaque;
     blk_insert_bs(blk_a, bs_a, &error_abort);
 
-    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
                                 &error_abort);
     b_s = bs_b->opaque;
@@ -475,13 +475,13 @@ static void test_graph_change_drain_subtree(void)
     BlockDriverState *bs_a, *bs_b, *backing;
     BDRVTestState *a_s, *b_s, *backing_s;
 
-    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
                                 &error_abort);
     a_s = bs_a->opaque;
     blk_insert_bs(blk_a, bs_a, &error_abort);
 
-    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
                                 &error_abort);
     b_s = bs_b->opaque;
@@ -555,7 +555,7 @@ static void test_graph_change_drain_all(void)
     BDRVTestState *a_s, *b_s;
 
     /* Create node A with a BlockBackend */
-    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
                                 &error_abort);
     a_s = bs_a->opaque;
@@ -571,7 +571,7 @@ static void test_graph_change_drain_all(void)
     g_assert_cmpint(a_s->drain_count, ==, 1);
 
     /* Create node B with a BlockBackend */
-    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
                                 &error_abort);
     b_s = bs_b->opaque;
@@ -672,7 +672,7 @@ static void test_iothread_common(enum drain_type drain_type, int drain_thread)
         goto out;
     }
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
                               &error_abort);
     s = bs->opaque;
@@ -883,7 +883,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     bdrv_set_backing_hd(src, src_backing, &error_abort);
     bdrv_unref(src_backing);
 
-    blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_src = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk_src, src_overlay, &error_abort);
 
     switch (drain_node) {
@@ -910,7 +910,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
 
     target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
                                   &error_abort);
-    blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_target = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk_target, target, &error_abort);
 
     aio_context_acquire(ctx);
@@ -1205,7 +1205,7 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete,
                         &error_abort);
     bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk, bs, &error_abort);
 
     /* Referenced by blk now */
@@ -1368,7 +1368,7 @@ static void test_detach_indirect(bool by_parent_cb)
     c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
 
     /* blk is a BB for parent-a */
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk, parent_a, &error_abort);
     bdrv_unref(parent_a);
 
@@ -1460,7 +1460,7 @@ static void test_append_to_drained(void)
     BlockDriverState *base, *overlay;
     BDRVTestState *base_s, *overlay_s;
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     base = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
     base_s = base->opaque;
     blk_insert_bs(blk, base, &error_abort);
diff --git a/tests/test-bdrv-graph-mod.c b/tests/test-bdrv-graph-mod.c
index 747c0bf8fc..cfeec36566 100644
--- a/tests/test-bdrv-graph-mod.c
+++ b/tests/test-bdrv-graph-mod.c
@@ -102,7 +102,8 @@ static void test_update_perm_tree(void)
 {
     Error *local_err = NULL;
 
-    BlockBackend *root = blk_new(BLK_PERM_WRITE | BLK_PERM_CONSISTENT_READ,
+    BlockBackend *root = blk_new(qemu_get_aio_context(),
+                                 BLK_PERM_WRITE | BLK_PERM_CONSISTENT_READ,
                                  BLK_PERM_ALL & ~BLK_PERM_WRITE);
     BlockDriverState *bs = no_perm_node("node");
     BlockDriverState *filter = pass_through_node("filter");
@@ -165,7 +166,7 @@ static void test_update_perm_tree(void)
  */
 static void test_should_update_child(void)
 {
-    BlockBackend *root = blk_new(0, BLK_PERM_ALL);
+    BlockBackend *root = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
     BlockDriverState *bs = no_perm_node("node");
     BlockDriverState *filter = no_perm_node("filter");
     BlockDriverState *target = no_perm_node("target");
diff --git a/tests/test-block-backend.c b/tests/test-block-backend.c
index fd59f02bd0..5b5d6845c0 100644
--- a/tests/test-block-backend.c
+++ b/tests/test-block-backend.c
@@ -37,7 +37,8 @@ static void test_drain_aio_error_flush_cb(void *opaque, int ret)
 
 static void test_drain_aio_error(void)
 {
-    BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    BlockBackend *blk = blk_new(qemu_get_aio_context(),
+                                BLK_PERM_ALL, BLK_PERM_ALL);
     BlockAIOCB *acb;
     bool completed = false;
 
@@ -53,7 +54,8 @@ static void test_drain_aio_error(void)
 
 static void test_drain_all_aio_error(void)
 {
-    BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    BlockBackend *blk = blk_new(qemu_get_aio_context(),
+                                BLK_PERM_ALL, BLK_PERM_ALL);
     BlockAIOCB *acb;
     bool completed = false;
 
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index 1d47ea9895..2200487d76 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -336,7 +336,7 @@ static void test_sync_op(const void *opaque)
     BlockDriverState *bs;
     BdrvChild *c;
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
     bs->total_sectors = 65536 / BDRV_SECTOR_SIZE;
     blk_insert_bs(blk, bs, &error_abort);
@@ -415,7 +415,7 @@ static void test_attach_blockjob(void)
     BlockDriverState *bs;
     TestBlockJob *tjob;
 
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
     blk_insert_bs(blk, bs, &error_abort);
 
@@ -481,7 +481,7 @@ static void test_propagate_basic(void)
     QDict *options;
 
     /* Create bs_a and its BlockBackend */
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     bs_a = bdrv_new_open_driver(&bdrv_test, "bs_a", BDRV_O_RDWR, &error_abort);
     blk_insert_bs(blk, bs_a, &error_abort);
 
@@ -561,7 +561,7 @@ static void test_propagate_diamond(void)
     qdict_put_str(options, "raw", "bs_c");
 
     bs_verify = bdrv_open(NULL, NULL, options, BDRV_O_RDWR, &error_abort);
-    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk, bs_verify, &error_abort);
 
     /* Switch the AioContext */
@@ -628,7 +628,7 @@ static void test_propagate_mirror(void)
     g_assert(bdrv_get_aio_context(filter) == main_ctx);
 
     /* With a BlockBackend on src, changing target must fail */
-    blk = blk_new(0, BLK_PERM_ALL);
+    blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
     blk_insert_bs(blk, src, &error_abort);
 
     bdrv_try_set_aio_context(target, ctx, &local_err);
diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c
index 652d1e8359..8c91980c70 100644
--- a/tests/test-blockjob.c
+++ b/tests/test-blockjob.c
@@ -68,7 +68,7 @@ static BlockJob *do_test_id(BlockBackend *blk, const char *id,
 static BlockBackend *create_blk(const char *name)
 {
     /* No I/O is performed on this device */
-    BlockBackend *blk = blk_new(0, BLK_PERM_ALL);
+    BlockBackend *blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
     BlockDriverState *bs;
 
     bs = bdrv_open("null-co://", NULL, NULL, 0, &error_abort);
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index 948a42c991..5644cf95ca 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -675,9 +675,9 @@ static void test_groups(void)
     ThrottleGroupMember *tgm1, *tgm2, *tgm3;
 
     /* No actual I/O is performed on these devices */
-    blk1 = blk_new(0, BLK_PERM_ALL);
-    blk2 = blk_new(0, BLK_PERM_ALL);
-    blk3 = blk_new(0, BLK_PERM_ALL);
+    blk1 = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
+    blk2 = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
+    blk3 = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
 
     blkp1 = blk_get_public(blk1);
     blkp2 = blk_get_public(blk2);

From 307a5f60ebe0506a5b90f323fdcabb4333405484 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 29 Apr 2019 17:40:14 +0200
Subject: [PATCH 14/29] block: Add qdev_prop_drive_iothread property type

Some qdev block devices have support for iothreads and take care of the
AioContext they are running in, but most devices don't know about any of
this. For the latter category, the qdev drive property must make sure
that their BlockBackend is in the main AioContext.

Unfortunately, while the current code just does the same thing for
devices that do support iothreads, this is not correct and it would show
as soon as we actually try to keep a consistent AioContext assignment
across all nodes and users of a block graph subtree: If a node is
already in a non-default AioContext because of one of its users,
attaching a new device should still be possible if that device can work
in the same AioContext. Switching the node back to the main context
first and only then into the device AioContext causes failure (because
the existing user wouldn't allow the switch to the main context).

So devices that support iothreads need a different kind of drive
property that leaves the node in its current AioContext, but by using
this type, the device promises to check later that it can work with this
context.

This patch adds the qdev infrastructure that allows devices to signal
that they handle iothreads and qdev should leave the AioContext alone.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/core/qdev-properties-system.c | 43 ++++++++++++++++++++++++++++----
 include/hw/block/block.h         |  7 ++++--
 include/hw/qdev-properties.h     |  3 +++
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
index 42e048f190..ba412dd2ca 100644
--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -69,8 +69,8 @@ static void set_pointer(Object *obj, Visitor *v, Property *prop,
 
 /* --- drive --- */
 
-static void parse_drive(DeviceState *dev, const char *str, void **ptr,
-                        const char *propname, Error **errp)
+static void do_parse_drive(DeviceState *dev, const char *str, void **ptr,
+                           const char *propname, bool iothread, Error **errp)
 {
     BlockBackend *blk;
     bool blk_created = false;
@@ -80,9 +80,16 @@ static void parse_drive(DeviceState *dev, const char *str, void **ptr,
     if (!blk) {
         BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL);
         if (bs) {
-            /* BlockBackends of devices start in the main context and are only
-             * later moved into another context if the device supports that. */
-            blk = blk_new(qemu_get_aio_context(), 0, BLK_PERM_ALL);
+            /*
+             * If the device supports iothreads, it will make sure to move the
+             * block node to the right AioContext if necessary (or fail if this
+             * isn't possible because of other users). Devices that are not
+             * aware of iothreads require their BlockBackends to be in the main
+             * AioContext.
+             */
+            AioContext *ctx = iothread ? bdrv_get_aio_context(bs) :
+                                         qemu_get_aio_context();
+            blk = blk_new(ctx, 0, BLK_PERM_ALL);
             blk_created = true;
 
             ret = blk_insert_bs(blk, bs, errp);
@@ -120,6 +127,18 @@ fail:
     }
 }
 
+static void parse_drive(DeviceState *dev, const char *str, void **ptr,
+                        const char *propname, Error **errp)
+{
+    do_parse_drive(dev, str, ptr, propname, false, errp);
+}
+
+static void parse_drive_iothread(DeviceState *dev, const char *str, void **ptr,
+                                 const char *propname, Error **errp)
+{
+    do_parse_drive(dev, str, ptr, propname, true, errp);
+}
+
 static void release_drive(Object *obj, const char *name, void *opaque)
 {
     DeviceState *dev = DEVICE(obj);
@@ -162,6 +181,12 @@ static void set_drive(Object *obj, Visitor *v, const char *name, void *opaque,
     set_pointer(obj, v, opaque, parse_drive, name, errp);
 }
 
+static void set_drive_iothread(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    set_pointer(obj, v, opaque, parse_drive_iothread, name, errp);
+}
+
 const PropertyInfo qdev_prop_drive = {
     .name  = "str",
     .description = "Node name or ID of a block device to use as a backend",
@@ -170,6 +195,14 @@ const PropertyInfo qdev_prop_drive = {
     .release = release_drive,
 };
 
+const PropertyInfo qdev_prop_drive_iothread = {
+    .name  = "str",
+    .description = "Node name or ID of a block device to use as a backend",
+    .get   = get_drive,
+    .set   = set_drive_iothread,
+    .release = release_drive,
+};
+
 /* --- character device --- */
 
 static void get_chr(Object *obj, Visitor *v, const char *name, void *opaque,
diff --git a/include/hw/block/block.h b/include/hw/block/block.h
index d06f25aa0f..607539057a 100644
--- a/include/hw/block/block.h
+++ b/include/hw/block/block.h
@@ -45,8 +45,7 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf)
     return exp;
 }
 
-#define DEFINE_BLOCK_PROPERTIES(_state, _conf)                          \
-    DEFINE_PROP_DRIVE("drive", _state, _conf.blk),                      \
+#define DEFINE_BLOCK_PROPERTIES_BASE(_state, _conf)                     \
     DEFINE_PROP_BLOCKSIZE("logical_block_size", _state,                 \
                           _conf.logical_block_size),                    \
     DEFINE_PROP_BLOCKSIZE("physical_block_size", _state,                \
@@ -59,6 +58,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf)
                             ON_OFF_AUTO_AUTO), \
     DEFINE_PROP_BOOL("share-rw", _state, _conf.share_rw, false)
 
+#define DEFINE_BLOCK_PROPERTIES(_state, _conf)                          \
+    DEFINE_PROP_DRIVE("drive", _state, _conf.blk),                      \
+    DEFINE_BLOCK_PROPERTIES_BASE(_state, _conf)
+
 #define DEFINE_BLOCK_CHS_PROPERTIES(_state, _conf)      \
     DEFINE_PROP_UINT32("cyls", _state, _conf.cyls, 0),  \
     DEFINE_PROP_UINT32("heads", _state, _conf.heads, 0), \
diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h
index b6758c852e..1eae5ab056 100644
--- a/include/hw/qdev-properties.h
+++ b/include/hw/qdev-properties.h
@@ -28,6 +28,7 @@ extern const PropertyInfo qdev_prop_blockdev_on_error;
 extern const PropertyInfo qdev_prop_bios_chs_trans;
 extern const PropertyInfo qdev_prop_fdc_drive_type;
 extern const PropertyInfo qdev_prop_drive;
+extern const PropertyInfo qdev_prop_drive_iothread;
 extern const PropertyInfo qdev_prop_netdev;
 extern const PropertyInfo qdev_prop_pci_devfn;
 extern const PropertyInfo qdev_prop_blocksize;
@@ -198,6 +199,8 @@ extern const PropertyInfo qdev_prop_pcie_link_width;
     DEFINE_PROP(_n, _s, _f, qdev_prop_netdev, NICPeers)
 #define DEFINE_PROP_DRIVE(_n, _s, _f) \
     DEFINE_PROP(_n, _s, _f, qdev_prop_drive, BlockBackend *)
+#define DEFINE_PROP_DRIVE_IOTHREAD(_n, _s, _f) \
+    DEFINE_PROP(_n, _s, _f, qdev_prop_drive_iothread, BlockBackend *)
 #define DEFINE_PROP_MACADDR(_n, _s, _f)         \
     DEFINE_PROP(_n, _s, _f, qdev_prop_macaddr, MACAddr)
 #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \

From 4f71fb436a6fbec7a7b2360cdba741deeb24be67 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 26 Apr 2019 19:29:47 +0200
Subject: [PATCH 15/29] scsi-disk: Use qdev_prop_drive_iothread

This makes use of qdev_prop_drive_iothread for scsi-disk so that the
disk can be attached to a node that is already in the target AioContext.
We need to check that the HBA actually supports iothreads, otherwise
scsi-disk must make sure that the node is already in the main
AioContext.

This changes the error message for conflicting iothread settings.
Previously, virtio-scsi produced the error message, now it comes from
blk_set_aio_context(). Update a test case accordingly.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi/scsi-disk.c        | 22 +++++++++++++++-------
 hw/scsi/virtio-scsi.c      | 15 ++++++++-------
 include/hw/scsi/scsi.h     |  1 +
 tests/qemu-iotests/240.out |  2 +-
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 91c5a8b1ac..7b89ac798b 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2336,6 +2336,13 @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
         return;
     }
 
+    if (blk_get_aio_context(s->qdev.conf.blk) != qemu_get_aio_context() &&
+        !s->qdev.hba_supports_iothread)
+    {
+        error_setg(errp, "HBA does not support iothreads");
+        return;
+    }
+
     if (dev->type == TYPE_DISK) {
         if (!blkconf_geometry(&dev->conf, NULL, 65535, 255, 255, errp)) {
             return;
@@ -2929,13 +2936,14 @@ static const TypeInfo scsi_disk_base_info = {
     .abstract      = true,
 };
 
-#define DEFINE_SCSI_DISK_PROPERTIES()                                \
-    DEFINE_BLOCK_PROPERTIES(SCSIDiskState, qdev.conf),               \
-    DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf),         \
-    DEFINE_PROP_STRING("ver", SCSIDiskState, version),               \
-    DEFINE_PROP_STRING("serial", SCSIDiskState, serial),             \
-    DEFINE_PROP_STRING("vendor", SCSIDiskState, vendor),             \
-    DEFINE_PROP_STRING("product", SCSIDiskState, product),           \
+#define DEFINE_SCSI_DISK_PROPERTIES()                                   \
+    DEFINE_PROP_DRIVE_IOTHREAD("drive", SCSIDiskState, qdev.conf.blk),  \
+    DEFINE_BLOCK_PROPERTIES_BASE(SCSIDiskState, qdev.conf),             \
+    DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf),            \
+    DEFINE_PROP_STRING("ver", SCSIDiskState, version),                  \
+    DEFINE_PROP_STRING("serial", SCSIDiskState, serial),                \
+    DEFINE_PROP_STRING("vendor", SCSIDiskState, vendor),                \
+    DEFINE_PROP_STRING("product", SCSIDiskState, product),              \
     DEFINE_PROP_STRING("device_id", SCSIDiskState, device_id)
 
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 01c2b85f90..2994f0738f 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -789,6 +789,13 @@ static void virtio_scsi_change(SCSIBus *bus, SCSIDevice *dev, SCSISense sense)
     }
 }
 
+static void virtio_scsi_pre_hotplug(HotplugHandler *hotplug_dev,
+                                    DeviceState *dev, Error **errp)
+{
+    SCSIDevice *sd = SCSI_DEVICE(dev);
+    sd->hba_supports_iothread = true;
+}
+
 static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
                                 Error **errp)
 {
@@ -798,16 +805,9 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
     int ret;
 
     if (s->ctx && !s->dataplane_fenced) {
-        AioContext *ctx;
         if (blk_op_is_blocked(sd->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
             return;
         }
-        ctx = blk_get_aio_context(sd->conf.blk);
-        if (ctx != s->ctx && ctx != qemu_get_aio_context()) {
-            error_setg(errp, "Cannot attach a blockdev that is using "
-                       "a different iothread");
-            return;
-        }
         virtio_scsi_acquire(s);
         ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
         virtio_scsi_release(s);
@@ -990,6 +990,7 @@ static void virtio_scsi_class_init(ObjectClass *klass, void *data)
     vdc->reset = virtio_scsi_reset;
     vdc->start_ioeventfd = virtio_scsi_dataplane_start;
     vdc->stop_ioeventfd = virtio_scsi_dataplane_stop;
+    hc->pre_plug = virtio_scsi_pre_hotplug;
     hc->plug = virtio_scsi_hotplug;
     hc->unplug = virtio_scsi_hotunplug;
 }
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index acef25faa4..426566a5c6 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -88,6 +88,7 @@ struct SCSIDevice
     int scsi_version;
     int default_scsi_version;
     bool needs_vpd_bl_emulation;
+    bool hba_supports_iothread;
 };
 
 extern const VMStateDescription vmstate_scsi_device;
diff --git a/tests/qemu-iotests/240.out b/tests/qemu-iotests/240.out
index 84e0a43ce5..d00df50297 100644
--- a/tests/qemu-iotests/240.out
+++ b/tests/qemu-iotests/240.out
@@ -43,7 +43,7 @@ QMP_VERSION
 {"return": {}}
 {"return": {}}
 {"return": {}}
-{"error": {"class": "GenericError", "desc": "Cannot attach a blockdev that is using a different iothread"}}
+{"error": {"class": "GenericError", "desc": "Cannot change iothread of active block backend"}}
 {"return": {}}
 {"return": {}}
 {"return": {}}

From 132ada80c4a6fea7b67e8bb0a5fd299994d927c6 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 24 Apr 2019 17:41:46 +0200
Subject: [PATCH 16/29] block: Adjust AioContexts when attaching nodes

So far, we only made sure that updating the AioContext of a node
affected the whole subtree. However, if a node is newly attached to a
new parent, we also need to make sure that both the subtree of the node
and the parent are in the same AioContext. This tries to move the new
child node to the parent AioContext and returns an error if this isn't
possible.

BlockBackends now actually apply their AioContext to their root node.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                   | 35 ++++++++++++++++++++++++++++++++++-
 block/block-backend.c     |  9 ++++-----
 blockjob.c                | 10 ++++++++--
 include/block/block_int.h |  1 +
 tests/test-bdrv-drain.c   |  6 ++++--
 5 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/block.c b/block.c
index 69ceac6377..7d9ed1a724 100644
--- a/block.c
+++ b/block.c
@@ -2249,14 +2249,19 @@ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
  *
  * On failure NULL is returned, errp is set and the reference to
  * child_bs is also dropped.
+ *
+ * The caller must hold the AioContext lock @child_bs, but not that of @ctx
+ * (unless @child_bs is already in @ctx).
  */
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                   const char *child_name,
                                   const BdrvChildRole *child_role,
+                                  AioContext *ctx,
                                   uint64_t perm, uint64_t shared_perm,
                                   void *opaque, Error **errp)
 {
     BdrvChild *child;
+    Error *local_err = NULL;
     int ret;
 
     ret = bdrv_check_update_perm(child_bs, NULL, perm, shared_perm, NULL, errp);
@@ -2276,6 +2281,31 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
         .opaque         = opaque,
     };
 
+    /* If the AioContexts don't match, first try to move the subtree of
+     * child_bs into the AioContext of the new parent. If this doesn't work,
+     * try moving the parent into the AioContext of child_bs instead. */
+    if (bdrv_get_aio_context(child_bs) != ctx) {
+        ret = bdrv_try_set_aio_context(child_bs, ctx, &local_err);
+        if (ret < 0 && child_role->can_set_aio_ctx) {
+            GSList *ignore = g_slist_prepend(NULL, child);;
+            ctx = bdrv_get_aio_context(child_bs);
+            if (child_role->can_set_aio_ctx(child, ctx, &ignore, NULL)) {
+                error_free(local_err);
+                ret = 0;
+                g_slist_free(ignore);
+                ignore = g_slist_prepend(NULL, child);;
+                child_role->set_aio_ctx(child, ctx, &ignore);
+            }
+            g_slist_free(ignore);
+        }
+        if (ret < 0) {
+            error_propagate(errp, local_err);
+            g_free(child);
+            bdrv_abort_perm_update(child_bs);
+            return NULL;
+        }
+    }
+
     /* This performs the matching bdrv_set_perm() for the above check. */
     bdrv_replace_child(child, child_bs);
 
@@ -2289,6 +2319,9 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
  *
  * On failure NULL is returned, errp is set and the reference to
  * child_bs is also dropped.
+ *
+ * If @parent_bs and @child_bs are in different AioContexts, the caller must
+ * hold the AioContext lock for @child_bs, but not for @parent_bs.
  */
 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
                              BlockDriverState *child_bs,
@@ -2302,11 +2335,11 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
     bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
 
     assert(parent_bs->drv);
-    assert(bdrv_get_aio_context(parent_bs) == bdrv_get_aio_context(child_bs));
     bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
                     perm, shared_perm, &perm, &shared_perm);
 
     child = bdrv_root_attach_child(child_bs, child_name, child_role,
+                                   bdrv_get_aio_context(parent_bs),
                                    perm, shared_perm, parent_bs, errp);
     if (child == NULL) {
         return NULL;
diff --git a/block/block-backend.c b/block/block-backend.c
index f03f14acb0..f5d9407d20 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -392,7 +392,7 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
         return NULL;
     }
 
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
                                        perm, BLK_PERM_ALL, blk, errp);
     if (!blk->root) {
         blk_unref(blk);
@@ -803,7 +803,7 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 {
     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
     bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
                                        blk->perm, blk->shared_perm, blk, errp);
     if (blk->root == NULL) {
         return -EPERM;
@@ -1861,10 +1861,9 @@ AioContext *blk_get_aio_context(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
 
-    /* FIXME The AioContext of bs and blk can be inconsistent. For the moment,
-     * we prefer the one of bs for compatibility. */
     if (bs) {
-        return bdrv_get_aio_context(blk_bs(blk));
+        AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
+        assert(ctx == blk->ctx);
     }
 
     return blk->ctx;
diff --git a/blockjob.c b/blockjob.c
index 29517ae162..931d675c0c 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -205,8 +205,14 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
     BdrvChild *c;
 
     bdrv_ref(bs);
-    c = bdrv_root_attach_child(bs, name, &child_job, perm, shared_perm,
-                               job, errp);
+    if (job->job.aio_context != qemu_get_aio_context()) {
+        aio_context_release(job->job.aio_context);
+    }
+    c = bdrv_root_attach_child(bs, name, &child_job, job->job.aio_context,
+                               perm, shared_perm, job, errp);
+    if (job->job.aio_context != qemu_get_aio_context()) {
+        aio_context_acquire(job->job.aio_context);
+    }
     if (c == NULL) {
         return -EPERM;
     }
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1eebc7c8f3..06df2bda1b 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1163,6 +1163,7 @@ void hmp_drive_add_node(Monitor *mon, const char *optstr);
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                   const char *child_name,
                                   const BdrvChildRole *child_role,
+                                  AioContext *ctx,
                                   uint64_t perm, uint64_t shared_perm,
                                   void *opaque, Error **errp);
 void bdrv_root_unref_child(BdrvChild *child);
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index fabbd52d93..16ea2b813f 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -912,6 +912,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
                                   &error_abort);
     blk_target = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
     blk_insert_bs(blk_target, target, &error_abort);
+    blk_set_allow_aio_context_change(blk_target, true);
 
     aio_context_acquire(ctx);
     tjob = block_job_create("job0", &test_job_driver, NULL, src,
@@ -972,7 +973,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     g_assert_false(job->job.paused);
     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
 
-    do_drain_begin(drain_type, target);
+    do_drain_begin_unlocked(drain_type, target);
 
     if (drain_type == BDRV_DRAIN_ALL) {
         /* bdrv_drain_all() drains both src and target */
@@ -983,7 +984,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
     g_assert_true(job->job.paused);
     g_assert_false(job->job.busy); /* The job is paused */
 
-    do_drain_end(drain_type, target);
+    do_drain_end_unlocked(drain_type, target);
 
     if (use_iothread) {
         /* paused is reset in the I/O thread, wait for it */
@@ -1002,6 +1003,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
 
     if (use_iothread) {
         blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
+        blk_set_aio_context(blk_target, qemu_get_aio_context(), &error_abort);
     }
     aio_context_release(ctx);
 

From 48946d7d64818c349e3e183838f99d8ce32a9b86 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 24 Apr 2019 17:49:33 +0200
Subject: [PATCH 17/29] test-block-iothread: Test adding parent to iothread
 node

Opening a new parent node for a node that has already been moved into a
different AioContext must cause the new parent to move into the same
context.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-block-iothread.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index 2200487d76..38f59999ab 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -663,6 +663,38 @@ static void test_propagate_mirror(void)
     bdrv_unref(target);
 }
 
+static void test_attach_second_node(void)
+{
+    IOThread *iothread = iothread_new();
+    AioContext *ctx = iothread_get_aio_context(iothread);
+    AioContext *main_ctx = qemu_get_aio_context();
+    BlockBackend *blk;
+    BlockDriverState *bs, *filter;
+    QDict *options;
+
+    blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    options = qdict_new();
+    qdict_put_str(options, "driver", "raw");
+    qdict_put_str(options, "file", "base");
+
+    filter = bdrv_open(NULL, NULL, options, BDRV_O_RDWR, &error_abort);
+    g_assert(blk_get_aio_context(blk) == ctx);
+    g_assert(bdrv_get_aio_context(bs) == ctx);
+    g_assert(bdrv_get_aio_context(filter) == ctx);
+
+    blk_set_aio_context(blk, main_ctx, &error_abort);
+    g_assert(blk_get_aio_context(blk) == main_ctx);
+    g_assert(bdrv_get_aio_context(bs) == main_ctx);
+    g_assert(bdrv_get_aio_context(filter) == main_ctx);
+
+    bdrv_unref(filter);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 int main(int argc, char **argv)
 {
     int i;
@@ -678,6 +710,7 @@ int main(int argc, char **argv)
     }
 
     g_test_add_func("/attach/blockjob", test_attach_blockjob);
+    g_test_add_func("/attach/second_node", test_attach_second_node);
     g_test_add_func("/propagate/basic", test_propagate_basic);
     g_test_add_func("/propagate/diamond", test_propagate_diamond);
     g_test_add_func("/propagate/mirror", test_propagate_mirror);

From 2e9cdab3000530fdd03df7f0a5d124eaa0ae942f Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 24 Apr 2019 17:47:34 +0200
Subject: [PATCH 18/29] test-block-iothread: BlockBackend AioContext across
 root node change

Test that BlockBackends preserve their assigned AioContext even when the
root node goes away. Inserting a new root node will move it to the right
AioContext.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-block-iothread.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index 38f59999ab..f41082e3bd 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -695,6 +695,38 @@ static void test_attach_second_node(void)
     blk_unref(blk);
 }
 
+static void test_attach_preserve_blk_ctx(void)
+{
+    IOThread *iothread = iothread_new();
+    AioContext *ctx = iothread_get_aio_context(iothread);
+    BlockBackend *blk;
+    BlockDriverState *bs;
+
+    blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
+    bs->total_sectors = 65536 / BDRV_SECTOR_SIZE;
+
+    /* Add node to BlockBackend that has an iothread context assigned */
+    blk_insert_bs(blk, bs, &error_abort);
+    g_assert(blk_get_aio_context(blk) == ctx);
+    g_assert(bdrv_get_aio_context(bs) == ctx);
+
+    /* Remove the node again */
+    blk_remove_bs(blk);
+    /* TODO bs should move back to main context here */
+    g_assert(blk_get_aio_context(blk) == ctx);
+    g_assert(bdrv_get_aio_context(bs) == ctx);
+
+    /* Re-attach the node */
+    blk_insert_bs(blk, bs, &error_abort);
+    g_assert(blk_get_aio_context(blk) == ctx);
+    g_assert(bdrv_get_aio_context(bs) == ctx);
+
+    blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
 int main(int argc, char **argv)
 {
     int i;
@@ -711,6 +743,7 @@ int main(int argc, char **argv)
 
     g_test_add_func("/attach/blockjob", test_attach_blockjob);
     g_test_add_func("/attach/second_node", test_attach_second_node);
+    g_test_add_func("/attach/preserve_blk_ctx", test_attach_preserve_blk_ctx);
     g_test_add_func("/propagate/basic", test_propagate_basic);
     g_test_add_func("/propagate/diamond", test_propagate_diamond);
     g_test_add_func("/propagate/mirror", test_propagate_mirror);

From ad943dcb226a083a0c433ae73737c4834f8cfe74 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 24 Apr 2019 18:04:42 +0200
Subject: [PATCH 19/29] block: Move node without parents to main AioContext

A node should only be in a non-default AioContext if a user is attached
to it that requires this. When the last parent of a node is gone, it can
move back to the main AioContext.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                     | 4 ++++
 tests/test-bdrv-drain.c     | 2 +-
 tests/test-block-iothread.c | 3 +--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 7d9ed1a724..6830f2d940 100644
--- a/block.c
+++ b/block.c
@@ -2235,6 +2235,10 @@ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
         bdrv_get_cumulative_perm(old_bs, &perm, &shared_perm);
         bdrv_check_perm(old_bs, NULL, perm, shared_perm, NULL, &error_abort);
         bdrv_set_perm(old_bs, perm, shared_perm);
+
+        /* When the parent requiring a non-default AioContext is removed, the
+         * node moves back to the main AioContext */
+        bdrv_try_set_aio_context(old_bs, qemu_get_aio_context(), NULL);
     }
 
     if (new_bs) {
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 16ea2b813f..44e658bde0 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -1003,7 +1003,7 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type,
 
     if (use_iothread) {
         blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
-        blk_set_aio_context(blk_target, qemu_get_aio_context(), &error_abort);
+        assert(blk_get_aio_context(blk_target) == qemu_get_aio_context());
     }
     aio_context_release(ctx);
 
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index f41082e3bd..79d9cf8a57 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -713,9 +713,8 @@ static void test_attach_preserve_blk_ctx(void)
 
     /* Remove the node again */
     blk_remove_bs(blk);
-    /* TODO bs should move back to main context here */
     g_assert(blk_get_aio_context(blk) == ctx);
-    g_assert(bdrv_get_aio_context(bs) == ctx);
+    g_assert(bdrv_get_aio_context(bs) == qemu_get_aio_context());
 
     /* Re-attach the node */
     blk_insert_bs(blk, bs, &error_abort);

From 8ed7d42fb528d380bb0fda1e91aa1770a1b05cf6 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 26 Apr 2019 16:12:27 +0200
Subject: [PATCH 20/29] blockdev: Use bdrv_try_set_aio_context() for monitor
 commands

Monitor commands can handle errors, so they can easily be converted to
using the safer bdrv_try_set_aio_context() function.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index e1ad92c559..3f44b891eb 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1535,6 +1535,7 @@ static void external_snapshot_prepare(BlkActionState *common,
                              DO_UPCAST(ExternalSnapshotState, common, common);
     TransactionAction *action = common->action;
     AioContext *aio_context;
+    int ret;
 
     /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar
      * purpose but a different set of parameters */
@@ -1674,7 +1675,10 @@ static void external_snapshot_prepare(BlkActionState *common,
         goto out;
     }
 
-    bdrv_set_aio_context(state->new_bs, aio_context);
+    ret = bdrv_try_set_aio_context(state->new_bs, aio_context, errp);
+    if (ret < 0) {
+        goto out;
+    }
 
     /* This removes our old bs and adds the new bs. This is an operation that
      * can fail, so we need to do it in .prepare; undoing it for abort is
@@ -3440,6 +3444,7 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn,
     int flags, job_flags = JOB_DEFAULT;
     int64_t size;
     bool set_backing_hd = false;
+    int ret;
 
     if (!backup->has_speed) {
         backup->speed = 0;
@@ -3541,7 +3546,11 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn,
         goto out;
     }
 
-    bdrv_set_aio_context(target_bs, aio_context);
+    ret = bdrv_try_set_aio_context(target_bs, aio_context, errp);
+    if (ret < 0) {
+        bdrv_unref(target_bs);
+        goto out;
+    }
 
     if (set_backing_hd) {
         bdrv_set_backing_hd(target_bs, source, &local_err);
@@ -3613,6 +3622,7 @@ BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn,
     AioContext *aio_context;
     BlockJob *job = NULL;
     int job_flags = JOB_DEFAULT;
+    int ret;
 
     if (!backup->has_speed) {
         backup->speed = 0;
@@ -3649,16 +3659,9 @@ BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn,
         goto out;
     }
 
-    if (bdrv_get_aio_context(target_bs) != aio_context) {
-        if (!bdrv_has_blk(target_bs)) {
-            /* The target BDS is not attached, we can safely move it to another
-             * AioContext. */
-            bdrv_set_aio_context(target_bs, aio_context);
-        } else {
-            error_setg(errp, "Target is attached to a different thread from "
-                             "source.");
-            goto out;
-        }
+    ret = bdrv_try_set_aio_context(target_bs, aio_context, errp);
+    if (ret < 0) {
+        goto out;
     }
 
     if (backup->has_bitmap) {
@@ -3831,6 +3834,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
     int flags;
     int64_t size;
     const char *format = arg->format;
+    int ret;
 
     bs = qmp_get_root_bs(arg->device, errp);
     if (!bs) {
@@ -3931,7 +3935,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
         goto out;
     }
 
-    bdrv_set_aio_context(target_bs, aio_context);
+    ret = bdrv_try_set_aio_context(target_bs, aio_context, errp);
+    if (ret < 0) {
+        bdrv_unref(target_bs);
+        goto out;
+    }
 
     blockdev_mirror_common(arg->has_job_id ? arg->job_id : NULL, bs, target_bs,
                            arg->has_replaces, arg->replaces, arg->sync,
@@ -3975,6 +3983,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
     AioContext *aio_context;
     BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN;
     Error *local_err = NULL;
+    int ret;
 
     bs = qmp_get_root_bs(device, errp);
     if (!bs) {
@@ -3989,7 +3998,10 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    bdrv_set_aio_context(target_bs, aio_context);
+    ret = bdrv_try_set_aio_context(target_bs, aio_context, errp);
+    if (ret < 0) {
+        goto out;
+    }
 
     blockdev_mirror_common(has_job_id ? job_id : NULL, bs, target_bs,
                            has_replaces, replaces, sync, backing_mode,
@@ -4005,7 +4017,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                            has_auto_dismiss, auto_dismiss,
                            &local_err);
     error_propagate(errp, local_err);
-
+out:
     aio_context_release(aio_context);
 }
 
@@ -4495,7 +4507,7 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread,
     old_context = bdrv_get_aio_context(bs);
     aio_context_acquire(old_context);
 
-    bdrv_set_aio_context(bs, new_context);
+    bdrv_try_set_aio_context(bs, new_context, errp);
 
     aio_context_release(old_context);
 }

From d0ee0204f400956ab429278f1b459d9af969c4a2 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 7 May 2019 18:19:16 +0200
Subject: [PATCH 21/29] block: Remove wrong bdrv_set_aio_context() calls

The mirror and commit block jobs use bdrv_set_aio_context() to move
their filter node into the right AioContext before hooking it up in the
graph. Similarly, bdrv_open_backing_file() explicitly moves the backing
file node into the right AioContext first.

This isn't necessary any more, they get automatically moved into the
right context now when attaching them.

However, in the case of bdrv_open_backing_file() with a node reference,
it's actually not only unnecessary, but even wrong: The unchecked
bdrv_set_aio_context() changes the AioContext of the child node even if
other parents require it to retain the old context. So this is not only
a simplification, but a bug fix, too.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1684342
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c        | 1 -
 block/commit.c | 2 --
 block/mirror.c | 1 -
 3 files changed, 4 deletions(-)

diff --git a/block.c b/block.c
index 6830f2d940..ddfae15d9b 100644
--- a/block.c
+++ b/block.c
@@ -2554,7 +2554,6 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
         ret = -EINVAL;
         goto free_exit;
     }
-    bdrv_set_aio_context(backing_hd, bdrv_get_aio_context(bs));
 
     if (implicit_backing) {
         bdrv_refresh_filename(backing_hd);
diff --git a/block/commit.c b/block/commit.c
index 4d519506d6..c815def89a 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -301,7 +301,6 @@ void commit_start(const char *job_id, BlockDriverState *bs,
         commit_top_bs->implicit = true;
     }
     commit_top_bs->total_sectors = top->total_sectors;
-    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top));
 
     bdrv_append(commit_top_bs, top, &local_err);
     if (local_err) {
@@ -443,7 +442,6 @@ int bdrv_commit(BlockDriverState *bs)
         error_report_err(local_err);
         goto ro_cleanup;
     }
-    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(backing_file_bs));
 
     bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
     bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
diff --git a/block/mirror.c b/block/mirror.c
index eb96b52de9..f8bdb5b21b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1543,7 +1543,6 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                                           BDRV_REQ_NO_FALLBACK;
     bs_opaque = g_new0(MirrorBDSOpaque, 1);
     mirror_top_bs->opaque = bs_opaque;
-    bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
 
     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
      * it alive until block_job_create() succeeds even if bs has no parent. */

From edbe36ad0f2fa8876b59bc77991ef00c969247a9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 8 May 2019 11:58:45 +0200
Subject: [PATCH 22/29] virtio-scsi-test: Test attaching new overlay with
 iothreads

This tests that blockdev-add can correctly add a qcow2 overlay to an
image used by a virtio-scsi disk in an iothread. The interesting point
here is whether the newly added node gets correctly moved into the
iothread AioContext.

If it isn't, we get an assertion failure in virtio-scsi while processing
the next request:

    virtio_scsi_ctx_check: Assertion `blk_get_aio_context(d->conf.blk) == s->ctx' failed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/libqtest.c         | 19 ++++++++++++
 tests/libqtest.h         | 11 +++++++
 tests/virtio-scsi-test.c | 63 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)

diff --git a/tests/libqtest.c b/tests/libqtest.c
index 8ac0c02af4..546a875913 100644
--- a/tests/libqtest.c
+++ b/tests/libqtest.c
@@ -1038,6 +1038,25 @@ QDict *qmp(const char *fmt, ...)
     return response;
 }
 
+void qmp_assert_success(const char *fmt, ...)
+{
+    va_list ap;
+    QDict *response;
+
+    va_start(ap, fmt);
+    response = qtest_vqmp(global_qtest, fmt, ap);
+    va_end(ap);
+
+    g_assert(response);
+    if (!qdict_haskey(response, "return")) {
+        QString *s = qobject_to_json_pretty(QOBJECT(response));
+        g_test_message("%s", qstring_get_str(s));
+        qobject_unref(s);
+    }
+    g_assert(qdict_haskey(response, "return"));
+    qobject_unref(response);
+}
+
 char *hmp(const char *fmt, ...)
 {
     va_list ap;
diff --git a/tests/libqtest.h b/tests/libqtest.h
index a98ea15b7d..32d927755d 100644
--- a/tests/libqtest.h
+++ b/tests/libqtest.h
@@ -619,6 +619,17 @@ static inline void qtest_end(void)
 QDict *qmp(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
 
 /**
+ * qmp_assert_success:
+ * @fmt...: QMP message to send to qemu, formatted like
+ * qobject_from_jsonf_nofail().  See parse_escape() for what's
+ * supported after '%'.
+ *
+ * Sends a QMP message to QEMU and asserts that a 'return' key is present in
+ * the response.
+ */
+void qmp_assert_success(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
+
+/*
  * qmp_eventwait:
  * @s: #event event to wait for.
  *
diff --git a/tests/virtio-scsi-test.c b/tests/virtio-scsi-test.c
index 162b31c88d..1e535cb178 100644
--- a/tests/virtio-scsi-test.c
+++ b/tests/virtio-scsi-test.c
@@ -188,6 +188,53 @@ static void test_unaligned_write_same(void *obj, void *data,
     qvirtio_scsi_pci_free(vs);
 }
 
+static void test_iothread_attach_node(void *obj, void *data,
+                                      QGuestAllocator *t_alloc)
+{
+    QVirtioSCSIPCI *scsi_pci = obj;
+    QVirtioSCSI *scsi = &scsi_pci->scsi;
+    QVirtioSCSIQueues *vs;
+    char tmp_path[] = "/tmp/qtest.XXXXXX";
+    int fd;
+    int ret;
+
+    uint8_t buf[512] = { 0 };
+    const uint8_t write_cdb[VIRTIO_SCSI_CDB_SIZE] = {
+        /* WRITE(10) to LBA 0, transfer length 1 */
+        0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00
+    };
+
+    alloc = t_alloc;
+    vs = qvirtio_scsi_init(scsi->vdev);
+
+    /* Create a temporary qcow2 overlay*/
+    fd = mkstemp(tmp_path);
+    g_assert(fd >= 0);
+    close(fd);
+
+    if (!have_qemu_img()) {
+        g_test_message("QTEST_QEMU_IMG not set or qemu-img missing; "
+                       "skipping snapshot test");
+        goto fail;
+    }
+
+    mkqcow2(tmp_path, 64);
+
+    /* Attach the overlay to the null0 node */
+    qmp_assert_success("{'execute': 'blockdev-add', 'arguments': {"
+                       "   'driver': 'qcow2', 'node-name': 'overlay',"
+                       "   'backing': 'null0', 'file': {"
+                       "     'driver': 'file', 'filename': %s}}}", tmp_path);
+
+    /* Send a request to see if the AioContext is still right */
+    ret = virtio_scsi_do_command(vs, write_cdb, NULL, 0, buf, 512, NULL);
+    g_assert_cmphex(ret, ==, 0);
+
+fail:
+    qvirtio_scsi_pci_free(vs);
+    unlink(tmp_path);
+}
+
 static void *virtio_scsi_hotplug_setup(GString *cmd_line, void *arg)
 {
     g_string_append(cmd_line,
@@ -204,6 +251,15 @@ static void *virtio_scsi_setup(GString *cmd_line, void *arg)
     return arg;
 }
 
+static void *virtio_scsi_setup_iothread(GString *cmd_line, void *arg)
+{
+    g_string_append(cmd_line,
+                    " -object iothread,id=thread0"
+                    " -blockdev driver=null-co,node-name=null0"
+                    " -device scsi-hd,drive=null0");
+    return arg;
+}
+
 static void register_virtio_scsi_test(void)
 {
     QOSGraphTestOptions opts = { };
@@ -214,6 +270,13 @@ static void register_virtio_scsi_test(void)
     opts.before = virtio_scsi_setup;
     qos_add_test("unaligned-write-same", "virtio-scsi",
                  test_unaligned_write_same, &opts);
+
+    opts.before = virtio_scsi_setup_iothread;
+    opts.edge = (QOSGraphEdgeOptions) {
+        .extra_device_opts = "iothread=thread0",
+    };
+    qos_add_test("iothread-attach-node", "virtio-scsi-pci",
+                 test_iothread_attach_node, &opts);
 }
 
 libqos_init(register_virtio_scsi_test);

From 6c8705351ee5b3f2ede8337fc93de320b3d4add6 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 23 May 2019 16:09:19 +0200
Subject: [PATCH 23/29] iotests: Attach new devices to node in non-default
 iothread

This tests that devices refuse to be attached to a node that has already
been moved to a different iothread if they can't be or aren't configured
to work in the same iothread.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/051        | 24 ++++++++++++++++++++++++
 tests/qemu-iotests/051.out    |  3 +++
 tests/qemu-iotests/051.pc.out | 27 +++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/tests/qemu-iotests/051 b/tests/qemu-iotests/051
index a3deb1fcad..200660f977 100755
--- a/tests/qemu-iotests/051
+++ b/tests/qemu-iotests/051
@@ -191,6 +191,30 @@ case "$QEMU_DEFAULT_MACHINE" in
         ;;
 esac
 
+echo
+echo === Attach to node in non-default iothread ===
+echo
+
+case "$QEMU_DEFAULT_MACHINE" in
+    pc)
+        iothread="-drive file=$TEST_IMG,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on"
+
+        # Can't add a device in the main thread while virtio-scsi0 uses the node
+        run_qemu $iothread -device ide-hd,drive=disk,share-rw=on
+        run_qemu $iothread -device virtio-blk-pci,drive=disk,share-rw=on
+        run_qemu $iothread -device lsi53c895a,id=lsi0 -device scsi-hd,bus=lsi0.0,drive=disk,share-rw=on
+        run_qemu $iothread -device virtio-scsi,id=virtio-scsi1 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
+
+        # virtio-blk enables the iothread only when the driver initialises the
+        # device, so a second virtio-blk device can't be added even with the
+        # same iothread. virtio-scsi allows this.
+        run_qemu $iothread -device virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on
+        run_qemu $iothread -device virtio-scsi,id=virtio-scsi1,iothread=thread0 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
+        ;;
+     *)
+        ;;
+esac
+
 echo
 echo === Read-only ===
 echo
diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out
index 9f1cf22608..8993835b94 100644
--- a/tests/qemu-iotests/051.out
+++ b/tests/qemu-iotests/051.out
@@ -137,6 +137,9 @@ QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) QEMU_PROG: -drive if=virtio: Device needs media, but drive is empty
 
 
+=== Attach to node in non-default iothread ===
+
+
 === Read-only ===
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=virtio,readonly=on
diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out
index c4743cc31c..2d811c166c 100644
--- a/tests/qemu-iotests/051.pc.out
+++ b/tests/qemu-iotests/051.pc.out
@@ -173,6 +173,33 @@ QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) QEMU_PROG: -device scsi-hd,drive=disk: Device needs media, but drive is empty
 
 
+=== Attach to node in non-default iothread ===
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device ide-hd,drive=disk,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) QEMU_PROG: -device ide-hd,drive=disk,share-rw=on: Cannot change iothread of active block backend
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) QEMU_PROG: -device virtio-blk-pci,drive=disk,share-rw=on: Cannot change iothread of active block backend
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device lsi53c895a,id=lsi0 -device scsi-hd,bus=lsi0.0,drive=disk,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) QEMU_PROG: -device scsi-hd,bus=lsi0.0,drive=disk,share-rw=on: HBA does not support iothreads
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) QEMU_PROG: -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on: Cannot change iothread of active block backend
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) QEMU_PROG: -device virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on: Cannot change iothread of active block backend
+
+Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1,iothread=thread0 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) quit
+
+
 === Read-only ===
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=floppy,readonly=on

From 26bf15e441d9d0aa7715bac28ef6a3f25a034df3 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 7 May 2019 18:22:10 +0200
Subject: [PATCH 24/29] test-bdrv-drain: Use bdrv_try_set_aio_context()

No reason to use the unchecked version in tests, even more so when these
are the last callers of bdrv_set_aio_context() outside of block.c.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 44e658bde0..12e2ecf517 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -1509,16 +1509,16 @@ static void test_set_aio_context(void)
                               &error_abort);
 
     bdrv_drained_begin(bs);
-    bdrv_set_aio_context(bs, ctx_a);
+    bdrv_try_set_aio_context(bs, ctx_a, &error_abort);
 
     aio_context_acquire(ctx_a);
     bdrv_drained_end(bs);
 
     bdrv_drained_begin(bs);
-    bdrv_set_aio_context(bs, ctx_b);
+    bdrv_try_set_aio_context(bs, ctx_b, &error_abort);
     aio_context_release(ctx_a);
     aio_context_acquire(ctx_b);
-    bdrv_set_aio_context(bs, qemu_get_aio_context());
+    bdrv_try_set_aio_context(bs, qemu_get_aio_context(), &error_abort);
     aio_context_release(ctx_b);
     bdrv_drained_end(bs);
 

From 42a65f02f9b380bd8074882d5844d4ea033389cc Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 7 May 2019 18:31:38 +0200
Subject: [PATCH 25/29] block: Remove bdrv_set_aio_context()

All callers of bdrv_set_aio_context() are eliminated now, they have
moved to bdrv_try_set_aio_context() and related safe functions. Remove
bdrv_set_aio_context().

With this, we can now know that the .set_aio_ctx callback must be
present in bdrv_set_aio_context_ignore() because
bdrv_can_set_aio_context() would have returned false previously, so
instead of checking the condition, we can assert it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                           | 30 ++++++++++++++----------------
 docs/devel/multiple-iothreads.txt |  4 ++--
 include/block/block.h             |  9 ---------
 3 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/block.c b/block.c
index ddfae15d9b..e3e77feee0 100644
--- a/block.c
+++ b/block.c
@@ -5789,8 +5789,17 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
     bs->walking_aio_notifiers = false;
 }
 
-/* @ignore will accumulate all visited BdrvChild object. The caller is
- * responsible for freeing the list afterwards. */
+/*
+ * Changes the AioContext used for fd handlers, timers, and BHs by this
+ * BlockDriverState and all its children and parents.
+ *
+ * The caller must own the AioContext lock for the old AioContext of bs, but it
+ * must not own the AioContext lock for new_context (unless new_context is the
+ * same as the current context of bs).
+ *
+ * @ignore will accumulate all visited BdrvChild object. The caller is
+ * responsible for freeing the list afterwards.
+ */
 void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                  AioContext *new_context, GSList **ignore)
 {
@@ -5813,10 +5822,9 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
         if (g_slist_find(*ignore, child)) {
             continue;
         }
-        if (child->role->set_aio_ctx) {
-            *ignore = g_slist_prepend(*ignore, child);
-            child->role->set_aio_ctx(child, new_context, ignore);
-        }
+        assert(child->role->set_aio_ctx);
+        *ignore = g_slist_prepend(*ignore, child);
+        child->role->set_aio_ctx(child, new_context, ignore);
     }
 
     bdrv_detach_aio_context(bs);
@@ -5830,16 +5838,6 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
     aio_context_release(new_context);
 }
 
-/* The caller must own the AioContext lock for the old AioContext of bs, but it
- * must not own the AioContext lock for new_context (unless new_context is
- * the same as the current context of bs). */
-void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
-{
-    GSList *ignore_list = NULL;
-    bdrv_set_aio_context_ignore(bs, new_context, &ignore_list);
-    g_slist_free(ignore_list);
-}
-
 static bool bdrv_parent_can_set_aio_context(BdrvChild *c, AioContext *ctx,
                                             GSList **ignore, Error **errp)
 {
diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt
index 4f9012d154..aeb997bed5 100644
--- a/docs/devel/multiple-iothreads.txt
+++ b/docs/devel/multiple-iothreads.txt
@@ -109,7 +109,7 @@ The AioContext originates from the QEMU block layer, even though nowadays
 AioContext is a generic event loop that can be used by any QEMU subsystem.
 
 The block layer has support for AioContext integrated.  Each BlockDriverState
-is associated with an AioContext using bdrv_set_aio_context() and
+is associated with an AioContext using bdrv_try_set_aio_context() and
 bdrv_get_aio_context().  This allows block layer code to process I/O inside the
 right AioContext.  Other subsystems may wish to follow a similar approach.
 
@@ -134,5 +134,5 @@ Long-running jobs (usually in the form of coroutines) are best scheduled in
 the BlockDriverState's AioContext to avoid the need to acquire/release around
 each bdrv_*() call.  The functions bdrv_add/remove_aio_context_notifier,
 or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
-can be used to get a notification whenever bdrv_set_aio_context() moves a
+can be used to get a notification whenever bdrv_try_set_aio_context() moves a
 BlockDriverState to a different AioContext.
diff --git a/include/block/block.h b/include/block/block.h
index 531cf595cf..13ea050a5b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -583,15 +583,6 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs);
  */
 void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co);
 
-/**
- * bdrv_set_aio_context:
- *
- * Changes the #AioContext used for fd handlers, timers, and BHs by this
- * BlockDriverState and all its children.
- *
- * This function must be called with iothread lock held.
- */
-void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context);
 void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                  AioContext *new_context, GSList **ignore);
 int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,

From 1477b6c803491cd1146a8858bf5cf8159913054d Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 23 Apr 2019 15:57:04 +0300
Subject: [PATCH 26/29] block/qcow2-refcount: add trace-point to
 qcow2_process_discards

Let's at least trace ignored failure.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 7 ++++++-
 block/trace-events     | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 3a2c673a5e..22cadd79d5 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
 #include "qemu/range.h"
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
+#include "trace.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                     uint64_t max);
@@ -737,7 +738,11 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
 
         /* Discard is optional, ignore the return value */
         if (ret >= 0) {
-            bdrv_pdiscard(bs->file, d->offset, d->bytes);
+            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
+            if (r2 < 0) {
+                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
+                                                           r2);
+            }
         }
 
         g_free(d);
diff --git a/block/trace-events b/block/trace-events
index 1e0653ce6d..eab51497fc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -92,6 +92,9 @@ qcow2_cache_get_done(void *co, int c, int i) "co %p is_l2_cache %d index %d"
 qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
 qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
 
+# qcow2-refcount.c
+qcow2_process_discards_failed_region(uint64_t offset, uint64_t bytes, int ret) "offset 0x%" PRIx64 " bytes 0x%" PRIx64 " ret %d"
+
 # qed-l2-cache.c
 qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
 qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"

From d93e57268892d555d7c71ec30b25276b0d8132b6 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 23 Apr 2019 15:57:05 +0300
Subject: [PATCH 27/29] block/io: bdrv_pdiscard: support int64_t bytes
 parameter

This fixes at least one overflow in qcow2_process_discards, which
passes 64bit region length to bdrv_pdiscard where bytes (or sectors in
the past) parameter is int since its introduction in 0b919fae.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c            | 16 ++++++++--------
 include/block/block.h |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index 0f6ebd001c..9ba1bada36 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2632,7 +2632,7 @@ int bdrv_flush(BlockDriverState *bs)
 typedef struct DiscardCo {
     BdrvChild *child;
     int64_t offset;
-    int bytes;
+    int64_t bytes;
     int ret;
 } DiscardCo;
 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
@@ -2643,14 +2643,15 @@ static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
     aio_wait_kick();
 }
 
-int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
+int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
+                                  int64_t bytes)
 {
     BdrvTrackedRequest req;
     int max_pdiscard, ret;
     int head, tail, align;
     BlockDriverState *bs = child->bs;
 
-    if (!bs || !bs->drv) {
+    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
         return -ENOMEDIUM;
     }
 
@@ -2658,9 +2659,8 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
         return -EPERM;
     }
 
-    ret = bdrv_check_byte_request(bs, offset, bytes);
-    if (ret < 0) {
-        return ret;
+    if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
+        return -EIO;
     }
 
     /* Do nothing if disabled.  */
@@ -2695,7 +2695,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
     assert(max_pdiscard >= bs->bl.request_alignment);
 
     while (bytes > 0) {
-        int num = bytes;
+        int64_t num = bytes;
 
         if (head) {
             /* Make small requests to get to alignment boundaries. */
@@ -2757,7 +2757,7 @@ out:
     return ret;
 }
 
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
+int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
 {
     Coroutine *co;
     DiscardCo rwco = {
diff --git a/include/block/block.h b/include/block/block.h
index 13ea050a5b..f9415ed740 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -434,8 +434,8 @@ void bdrv_drain_all(void);
     AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),              \
                    cond); })
 
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes);
-int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes);
+int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs);

From 3dd3e248eb65e58979b5fa3cb1855c758c4fbe61 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 4 Jun 2019 15:39:48 +0300
Subject: [PATCH 28/29] iotests: test big qcow2 shrink

This test checks bug in qcow2_process_discards, fixed by previous
commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/250     | 78 ++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/250.out | 16 ++++++++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 95 insertions(+)
 create mode 100755 tests/qemu-iotests/250
 create mode 100644 tests/qemu-iotests/250.out

diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
new file mode 100755
index 0000000000..c9c0a84a5a
--- /dev/null
+++ b/tests/qemu-iotests/250
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+#
+# Test big discard in qcow2 shrink
+#
+# Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=vsementsov@virtuozzo.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+# This test checks that qcow2_process_discards does not truncate a discard
+# request > 2G.
+# To reproduce bug we need to overflow int by one sequential discard, so we
+# need size > 2G, bigger cluster size (as with default 64k we may have maximum
+# of 512M sequential data, corresponding to one L1 entry), and we need some
+# data of the beginning of the disk mapped to the end of file to prevent
+# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
+# anyway.
+
+disk_usage()
+{
+    du --block-size=1 $1 | awk '{print $1}'
+}
+
+size=2100M
+IMGOPTS="cluster_size=1M,preallocation=metadata"
+
+_make_test_img $size
+$QEMU_IO -c 'discard 0 10M' -c 'discard 2090M 10M' \
+         -c 'write 2090M 10M' -c 'write 0 10M' "$TEST_IMG" | _filter_qemu_io
+
+# Check that our trick with swapping first and last 10M chunks succeeded.
+# Otherwise test may pass even if bdrv_pdiscard() fails in
+# qcow2_process_discards()
+$QEMU_IMG map "$TEST_IMG" | _filter_testdir
+
+before=$(disk_usage "$TEST_IMG")
+$QEMU_IMG resize --shrink "$TEST_IMG" 5M
+after=$(disk_usage "$TEST_IMG")
+
+echo "Disk usage delta: $((before - after))"
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/250.out b/tests/qemu-iotests/250.out
new file mode 100644
index 0000000000..f480fd273b
--- /dev/null
+++ b/tests/qemu-iotests/250.out
@@ -0,0 +1,16 @@
+QA output created by 250
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=2202009600 preallocation=metadata
+discard 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+discard 10485760/10485760 bytes at offset 2191523840
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 2191523840
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset          Length          Mapped to       File
+0               0xa00000        0x82f00000      TEST_DIR/t.qcow2
+0x82a00000      0xa00000        0x500000        TEST_DIR/t.qcow2
+Image resized.
+Disk usage delta: 15728640
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 88049ad46c..f3b6d601b2 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -262,6 +262,7 @@
 247 rw quick
 248 rw quick
 249 rw auto quick
+250 rw auto quick
 252 rw auto backing quick
 253 rw auto quick
 254 rw auto backing quick

From 11ba81c3cde0bc070cced6e8ef2835fab4fe90c8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 3 Jun 2019 15:43:20 +0200
Subject: [PATCH 29/29] iotests: Fix duplicated diff output on failure

Commit 70ff5b07 wanted to move the diff between actual and reference
output to the end after printing the test result line. It really only
copied it, though, so the diff is now displayed twice. Remove the old
one.

Fixes: 70ff5b07fcdd378180ad2d5cc0b0d5e67e7ef325
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/check | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index 95162c6cf9..44ebf24080 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -876,7 +876,6 @@ do
                     fi
                 else
                     mv $tmp.out $seq.out.bad
-                    $diff -w "$reference" "$PWD"/$seq.out.bad
                     status="fail"
                     results="output mismatch (see $seq.out.bad)"
                     printdiff=true