From b2f65d6b02b7df0a780a974a97f26fbf0e464964 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 31 May 2016 16:41:09 +0200
Subject: [PATCH 01/39] qcow2: Work with bytes in qcow2_get_cluster_offset()

This patch changes the units that qcow2_get_cluster_offset() uses
internally, without touching the interface just yet. This will be done
in another patch.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2-cluster.c | 44 ++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index b04bfafd65..a59fb3493c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -482,29 +482,28 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     unsigned int l2_index;
     uint64_t l1_index, l2_offset, *l2_table;
     int l1_bits, c;
-    unsigned int index_in_cluster, nb_clusters;
-    uint64_t nb_available, nb_needed;
+    unsigned int offset_in_cluster, nb_clusters;
+    uint64_t bytes_available, bytes_needed;
     int ret;
+    unsigned int bytes;
 
-    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
-    nb_needed = *num + index_in_cluster;
+    assert(*num <= BDRV_REQUEST_MAX_SECTORS);
+    bytes = *num * BDRV_SECTOR_SIZE;
+
+    offset_in_cluster = offset_into_cluster(s, offset);
+    bytes_needed = (uint64_t) bytes + offset_in_cluster;
 
     l1_bits = s->l2_bits + s->cluster_bits;
 
-    /* compute how many bytes there are between the offset and
-     * the end of the l1 entry
-     */
+    /* compute how many bytes there are between the start of the cluster
+     * containing offset and the end of the l1 entry */
+    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
+                    + offset_in_cluster;
 
-    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
-
-    /* compute the number of available sectors */
-
-    nb_available = (nb_available >> 9) + index_in_cluster;
-
-    if (nb_needed > nb_available) {
-        nb_needed = nb_available;
+    if (bytes_needed > bytes_available) {
+        bytes_needed = bytes_available;
     }
-    assert(nb_needed <= INT_MAX);
+    assert(bytes_needed <= INT_MAX);
 
     *cluster_offset = 0;
 
@@ -542,7 +541,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 
     /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
-    nb_clusters = size_to_clusters(s, nb_needed << 9);
+    nb_clusters = size_to_clusters(s, bytes_needed);
 
     ret = qcow2_get_cluster_type(*cluster_offset);
     switch (ret) {
@@ -589,13 +588,16 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 
     qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 
-    nb_available = (c * s->cluster_sectors);
+    bytes_available = (c * s->cluster_size);
 
 out:
-    if (nb_available > nb_needed)
-        nb_available = nb_needed;
+    if (bytes_available > bytes_needed) {
+        bytes_available = bytes_needed;
+    }
 
-    *num = nb_available - index_in_cluster;
+    bytes = bytes_available - offset_in_cluster;
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    *num = bytes >> BDRV_SECTOR_BITS;
 
     return ret;
 

From ecfe186380e6fc3a556dff05b866ca213e277fc7 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 31 May 2016 16:13:07 +0200
Subject: [PATCH 02/39] qcow2: Implement .bdrv_co_preadv()

Reading from qcow2 images is now byte granularity.

Most of the affected code in qcow2 actually gets simpler with this
change. The only exception is encryption, which is fixed on 512 bytes
blocks; in order to keep this working, bs->request_alignment is set for
encrypted images.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2-cluster.c |  27 +++++------
 block/qcow2.c         | 110 +++++++++++++++++++++++-------------------
 block/qcow2.h         |   2 +-
 3 files changed, 73 insertions(+), 66 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index a59fb3493c..277764f36c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -424,7 +424,8 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
      * interface.  This avoids double I/O throttling and request tracking,
      * which can lead to deadlock when block layer copy-on-read is enabled.
      */
-    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+    ret = bs->drv->bdrv_co_preadv(bs, (start_sect + n_start) * BDRV_SECTOR_SIZE,
+                                  n * BDRV_SECTOR_SIZE, &qiov, 0);
     if (ret < 0) {
         goto out;
     }
@@ -464,19 +465,21 @@ out:
 /*
  * get_cluster_offset
  *
- * For a given offset of the disk image, find the cluster offset in
- * qcow2 file. The offset is stored in *cluster_offset.
+ * For a given offset of the virtual disk, find the cluster type and offset in
+ * the qcow2 file. The offset is stored in *cluster_offset.
  *
- * on entry, *num is the number of contiguous sectors we'd like to
- * access following offset.
+ * On entry, *bytes is the maximum number of contiguous bytes starting at
+ * offset that we are interested in.
  *
- * on exit, *num is the number of contiguous sectors we can read.
+ * On exit, *bytes is the number of bytes starting at offset that have the same
+ * cluster type and (if applicable) are stored contiguously in the image file.
+ * Compressed clusters are always returned one by one.
  *
  * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
  * cases.
  */
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *cluster_offset)
+                             unsigned int *bytes, uint64_t *cluster_offset)
 {
     BDRVQcow2State *s = bs->opaque;
     unsigned int l2_index;
@@ -485,13 +488,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     unsigned int offset_in_cluster, nb_clusters;
     uint64_t bytes_available, bytes_needed;
     int ret;
-    unsigned int bytes;
-
-    assert(*num <= BDRV_REQUEST_MAX_SECTORS);
-    bytes = *num * BDRV_SECTOR_SIZE;
 
     offset_in_cluster = offset_into_cluster(s, offset);
-    bytes_needed = (uint64_t) bytes + offset_in_cluster;
+    bytes_needed = (uint64_t) *bytes + offset_in_cluster;
 
     l1_bits = s->l2_bits + s->cluster_bits;
 
@@ -595,9 +594,7 @@ out:
         bytes_available = bytes_needed;
     }
 
-    bytes = bytes_available - offset_in_cluster;
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-    *num = bytes >> BDRV_SECTOR_BITS;
+    *bytes = bytes_available - offset_in_cluster;
 
     return ret;
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 6f5fb810e4..545dea0ab3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -975,6 +975,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
         }
 
         bs->encrypted = 1;
+
+        /* Encryption works on a sector granularity */
+        bs->request_alignment = BDRV_SECTOR_SIZE;
     }
 
     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
@@ -1331,16 +1334,20 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
     BDRVQcow2State *s = bs->opaque;
     uint64_t cluster_offset;
     int index_in_cluster, ret;
+    unsigned int bytes;
     int64_t status = 0;
 
-    *pnum = nb_sectors;
+    bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
     qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+    ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes,
+                                   &cluster_offset);
     qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
         return ret;
     }
 
+    *pnum = bytes >> BDRV_SECTOR_BITS;
+
     if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
         !s->cipher) {
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
@@ -1358,28 +1365,34 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
 
 /* handle reading after the end of the backing file */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                  int64_t sector_num, int nb_sectors)
+                        int64_t offset, int bytes)
 {
+    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
     int n1;
-    if ((sector_num + nb_sectors) <= bs->total_sectors)
-        return nb_sectors;
-    if (sector_num >= bs->total_sectors)
-        n1 = 0;
-    else
-        n1 = bs->total_sectors - sector_num;
 
-    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
+    if ((offset + bytes) <= bs_size) {
+        return bytes;
+    }
+
+    if (offset >= bs_size) {
+        n1 = 0;
+    } else {
+        n1 = bs_size - offset;
+    }
+
+    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
 
     return n1;
 }
 
-static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
-                          int remaining_sectors, QEMUIOVector *qiov)
+static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                        uint64_t bytes, QEMUIOVector *qiov,
+                                        int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int index_in_cluster, n1;
+    int offset_in_cluster, n1;
     int ret;
-    int cur_nr_sectors; /* number of sectors in current iteration */
+    unsigned int cur_bytes; /* number of bytes in current iteration */
     uint64_t cluster_offset = 0;
     uint64_t bytes_done = 0;
     QEMUIOVector hd_qiov;
@@ -1389,26 +1402,24 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
 
     qemu_co_mutex_lock(&s->lock);
 
-    while (remaining_sectors != 0) {
+    while (bytes != 0) {
 
         /* prepare next request */
-        cur_nr_sectors = remaining_sectors;
+        cur_bytes = MIN(bytes, INT_MAX);
         if (s->cipher) {
-            cur_nr_sectors = MIN(cur_nr_sectors,
-                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+            cur_bytes = MIN(cur_bytes,
+                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
         }
 
-        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
-            &cur_nr_sectors, &cluster_offset);
+        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
         if (ret < 0) {
             goto fail;
         }
 
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        offset_in_cluster = offset_into_cluster(s, offset);
 
         qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
-            cur_nr_sectors * 512);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);
 
         switch (ret) {
         case QCOW2_CLUSTER_UNALLOCATED:
@@ -1416,18 +1427,17 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
             if (bs->backing) {
                 /* read from the base image */
                 n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                    sector_num, cur_nr_sectors);
+                                         offset, cur_bytes);
                 if (n1 > 0) {
                     QEMUIOVector local_qiov;
 
                     qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0,
-                                      n1 * BDRV_SECTOR_SIZE);
+                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
 
                     BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                     qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_readv(bs->backing->bs, sector_num,
-                                        n1, &local_qiov);
+                    ret = bdrv_co_preadv(bs->backing->bs, offset, n1,
+                                         &local_qiov, 0);
                     qemu_co_mutex_lock(&s->lock);
 
                     qemu_iovec_destroy(&local_qiov);
@@ -1438,12 +1448,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                 }
             } else {
                 /* Note: in this case, no need to wait */
-                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
             }
             break;
 
         case QCOW2_CLUSTER_ZERO:
-            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
             break;
 
         case QCOW2_CLUSTER_COMPRESSED:
@@ -1454,8 +1464,8 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
             }
 
             qemu_iovec_from_buf(&hd_qiov, 0,
-                s->cluster_cache + index_in_cluster * 512,
-                512 * cur_nr_sectors);
+                                s->cluster_cache + offset_in_cluster,
+                                cur_bytes);
             break;
 
         case QCOW2_CLUSTER_NORMAL:
@@ -1482,34 +1492,34 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                     }
                 }
 
-                assert(cur_nr_sectors <=
-                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                 qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_add(&hd_qiov, cluster_data,
-                    512 * cur_nr_sectors);
+                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
             }
 
             BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
             qemu_co_mutex_unlock(&s->lock);
-            ret = bdrv_co_readv(bs->file->bs,
-                                (cluster_offset >> 9) + index_in_cluster,
-                                cur_nr_sectors, &hd_qiov);
+            ret = bdrv_co_preadv(bs->file->bs,
+                                 cluster_offset + offset_in_cluster,
+                                 cur_bytes, &hd_qiov, 0);
             qemu_co_mutex_lock(&s->lock);
             if (ret < 0) {
                 goto fail;
             }
             if (bs->encrypted) {
                 assert(s->cipher);
+                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                 Error *err = NULL;
-                if (qcow2_encrypt_sectors(s, sector_num,  cluster_data,
-                                          cluster_data, cur_nr_sectors, false,
-                                          &err) < 0) {
+                if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
+                                          cluster_data, cluster_data,
+                                          cur_bytes >> BDRV_SECTOR_BITS,
+                                          false, &err) < 0) {
                     error_free(err);
                     ret = -EIO;
                     goto fail;
                 }
-                qemu_iovec_from_buf(qiov, bytes_done,
-                    cluster_data, 512 * cur_nr_sectors);
+                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
             }
             break;
 
@@ -1519,9 +1529,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
             goto fail;
         }
 
-        remaining_sectors -= cur_nr_sectors;
-        sector_num += cur_nr_sectors;
-        bytes_done += cur_nr_sectors * 512;
+        bytes -= cur_bytes;
+        offset += cur_bytes;
+        bytes_done += cur_bytes;
     }
     ret = 0;
 
@@ -2435,7 +2445,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     if (head || tail) {
         int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
         uint64_t off;
-        int nr;
+        unsigned int nr;
 
         assert(head + count <= s->cluster_size);
 
@@ -2452,7 +2462,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
         /* We can have new write after previous check */
         offset = cl_start << BDRV_SECTOR_BITS;
         count = s->cluster_size;
-        nr = s->cluster_sectors;
+        nr = s->cluster_size;
         ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
         if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
             qemu_co_mutex_unlock(&s->lock);
@@ -3368,7 +3378,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_co_get_block_status = qcow2_co_get_block_status,
     .bdrv_set_key       = qcow2_set_key,
 
-    .bdrv_co_readv          = qcow2_co_readv,
+    .bdrv_co_preadv         = qcow2_co_preadv,
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 7db9795d44..e2c42d5514 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -544,7 +544,7 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
                           int nb_sectors, bool enc, Error **errp);
 
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *cluster_offset);
+                             unsigned int *bytes, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int *num, uint64_t *host_offset, QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,

From aaa4d20b4972bb1a811ce929502e6741835d584e Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Jun 2016 15:21:05 +0200
Subject: [PATCH 03/39] qcow2: Make copy_sectors() byte based

This will allow copy on write operations where the overwritten part of
the cluster is not aligned to sector boundaries.

Also rename the function because it has nothing to do with sectors any
more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2-cluster.c | 55 +++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 277764f36c..b7671fc1de 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -390,22 +390,18 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
     return 0;
 }
 
-static int coroutine_fn copy_sectors(BlockDriverState *bs,
-                                     uint64_t start_sect,
-                                     uint64_t cluster_offset,
-                                     int n_start, int n_end)
+static int coroutine_fn do_perform_cow(BlockDriverState *bs,
+                                       uint64_t src_cluster_offset,
+                                       uint64_t cluster_offset,
+                                       int offset_in_cluster,
+                                       int bytes)
 {
     BDRVQcow2State *s = bs->opaque;
     QEMUIOVector qiov;
     struct iovec iov;
-    int n, ret;
+    int ret;
 
-    n = n_end - n_start;
-    if (n <= 0) {
-        return 0;
-    }
-
-    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_len = bytes;
     iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
     if (iov.iov_base == NULL) {
         return -ENOMEM;
@@ -424,18 +420,21 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
      * interface.  This avoids double I/O throttling and request tracking,
      * which can lead to deadlock when block layer copy-on-read is enabled.
      */
-    ret = bs->drv->bdrv_co_preadv(bs, (start_sect + n_start) * BDRV_SECTOR_SIZE,
-                                  n * BDRV_SECTOR_SIZE, &qiov, 0);
+    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
+                                  bytes, &qiov, 0);
     if (ret < 0) {
         goto out;
     }
 
     if (bs->encrypted) {
         Error *err = NULL;
+        int64_t sector = (cluster_offset + offset_in_cluster)
+                         >> BDRV_SECTOR_BITS;
         assert(s->cipher);
-        if (qcow2_encrypt_sectors(s, start_sect + n_start,
-                                  iov.iov_base, iov.iov_base, n,
-                                  true, &err) < 0) {
+        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
+        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
+        if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
+                                  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
             ret = -EIO;
             error_free(err);
             goto out;
@@ -443,14 +442,14 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
     }
 
     ret = qcow2_pre_write_overlap_check(bs, 0,
-            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
+            cluster_offset + offset_in_cluster, bytes);
     if (ret < 0) {
         goto out;
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
-                         &qiov);
+    ret = bdrv_co_pwritev(bs->file->bs, cluster_offset + offset_in_cluster,
+                          bytes, &qiov, 0);
     if (ret < 0) {
         goto out;
     }
@@ -745,9 +744,8 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
     }
 
     qemu_co_mutex_unlock(&s->lock);
-    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
-                       r->offset / BDRV_SECTOR_SIZE,
-                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+    ret = do_perform_cow(bs, m->offset, m->alloc_offset,
+                         r->offset, r->nb_sectors * BDRV_SECTOR_SIZE);
     qemu_co_mutex_lock(&s->lock);
 
     if (ret < 0) {
@@ -809,13 +807,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     assert(l2_index + m->nb_clusters <= s->l2_size);
     for (i = 0; i < m->nb_clusters; i++) {
         /* if two concurrent writes happen to the same unallocated cluster
-	 * each write allocates separate cluster and writes data concurrently.
-	 * The first one to complete updates l2 table with pointer to its
-	 * cluster the second one has to do RMW (which is done above by
-	 * copy_sectors()), update l2 table with its cluster pointer and free
-	 * old cluster. This is what this loop does */
-        if(l2_table[l2_index + i] != 0)
+         * each write allocates separate cluster and writes data concurrently.
+         * The first one to complete updates l2 table with pointer to its
+         * cluster the second one has to do RMW (which is done above by
+         * perform_cow()), update l2 table with its cluster pointer and free
+         * old cluster. This is what this loop does */
+        if (l2_table[l2_index + i] != 0) {
             old_cluster[j++] = l2_table[l2_index + i];
+        }
 
         l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);

From 8556739355d2a5392334c237486e32bd0f24c227 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Jun 2016 14:08:56 +0200
Subject: [PATCH 04/39] qcow2: Use bytes instead of sectors for QCowL2Meta

In preparation for implementing .bdrv_co_pwritev in qcow2.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2-cluster.c | 32 ++++++++++++--------------------
 block/qcow2.h         | 13 +++----------
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index b7671fc1de..b24230b12c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -739,13 +739,12 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
     BDRVQcow2State *s = bs->opaque;
     int ret;
 
-    if (r->nb_sectors == 0) {
+    if (r->nb_bytes == 0) {
         return 0;
     }
 
     qemu_co_mutex_unlock(&s->lock);
-    ret = do_perform_cow(bs, m->offset, m->alloc_offset,
-                         r->offset, r->nb_sectors * BDRV_SECTOR_SIZE);
+    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
     qemu_co_mutex_lock(&s->lock);
 
     if (ret < 0) {
@@ -1196,25 +1195,20 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     /*
      * Save info needed for meta data update.
      *
-     * requested_sectors: Number of sectors from the start of the first
+     * requested_bytes: Number of bytes from the start of the first
      * newly allocated cluster to the end of the (possibly shortened
      * before) write request.
      *
-     * avail_sectors: Number of sectors from the start of the first
+     * avail_bytes: Number of bytes from the start of the first
      * newly allocated to the end of the last newly allocated cluster.
      *
-     * nb_sectors: The number of sectors from the start of the first
+     * nb_bytes: The number of bytes from the start of the first
      * newly allocated cluster to the end of the area that the write
      * request actually writes to (excluding COW at the end)
      */
-    int requested_sectors =
-        (*bytes + offset_into_cluster(s, guest_offset))
-        >> BDRV_SECTOR_BITS;
-    int avail_sectors = nb_clusters
-                        << (s->cluster_bits - BDRV_SECTOR_BITS);
-    int alloc_n_start = offset_into_cluster(s, guest_offset)
-                        >> BDRV_SECTOR_BITS;
-    int nb_sectors = MIN(requested_sectors, avail_sectors);
+    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
+    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
+    int nb_bytes = MIN(requested_bytes, avail_bytes);
     QCowL2Meta *old_m = *m;
 
     *m = g_malloc0(sizeof(**m));
@@ -1225,23 +1219,21 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         .alloc_offset   = alloc_cluster_offset,
         .offset         = start_of_cluster(s, guest_offset),
         .nb_clusters    = nb_clusters,
-        .nb_available   = nb_sectors,
 
         .cow_start = {
             .offset     = 0,
-            .nb_sectors = alloc_n_start,
+            .nb_bytes   = offset_into_cluster(s, guest_offset),
         },
         .cow_end = {
-            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
-            .nb_sectors = avail_sectors - nb_sectors,
+            .offset     = nb_bytes,
+            .nb_bytes   = avail_bytes - nb_bytes,
         },
     };
     qemu_co_queue_init(&(*m)->dependent_requests);
     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
 
     *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
-    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
-                         - offset_into_cluster(s, guest_offset));
+    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
     assert(*bytes != 0);
 
     return 1;
diff --git a/block/qcow2.h b/block/qcow2.h
index e2c42d5514..175b0c1740 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion {
      */
     uint64_t    offset;
 
-    /** Number of sectors to copy */
-    int         nb_sectors;
+    /** Number of bytes to copy */
+    int         nb_bytes;
 } Qcow2COWRegion;
 
 /**
@@ -318,12 +318,6 @@ typedef struct QCowL2Meta
     /** Host offset of the first newly allocated cluster */
     uint64_t alloc_offset;
 
-    /**
-     * Number of sectors from the start of the first allocated cluster to
-     * the end of the (possibly shortened) request
-     */
-    int nb_available;
-
     /** Number of newly allocated clusters */
     int nb_clusters;
 
@@ -471,8 +465,7 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
 
 static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
 {
-    return m->offset + m->cow_end.offset
-        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+    return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
 }
 
 static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)

From d46a0bb24d6061c1eadcf1136fa73dc7c4ac267d Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Jun 2016 16:55:05 +0200
Subject: [PATCH 05/39] qcow2: Implement .bdrv_co_pwritev()

This changes qcow2 to implement the byte-based .bdrv_co_pwritev
interface rather than the sector-based old one.

As preallocation uses the same allocation function as normal writes, and
the interface of that function needs to be changed, it is converted in
the same patch.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2-cluster.c | 13 +++----
 block/qcow2.c         | 89 ++++++++++++++++++++-----------------------
 block/qcow2.h         |  3 +-
 trace-events          |  6 +--
 4 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index b24230b12c..893ddf6798 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1265,7 +1265,8 @@ fail:
  * Return 0 on success and -errno in error cases
  */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *host_offset, QCowL2Meta **m)
+                               unsigned int *bytes, uint64_t *host_offset,
+                               QCowL2Meta **m)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t start, remaining;
@@ -1273,13 +1274,11 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     uint64_t cur_bytes;
     int ret;
 
-    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
-
-    assert((offset & ~BDRV_SECTOR_MASK) == 0);
+    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
 
 again:
     start = offset;
-    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
+    remaining = *bytes;
     cluster_offset = 0;
     *host_offset = 0;
     cur_bytes = 0;
@@ -1365,8 +1364,8 @@ again:
         }
     }
 
-    *num -= remaining >> BDRV_SECTOR_BITS;
-    assert(*num > 0);
+    *bytes -= remaining;
+    assert(*bytes > 0);
     assert(*host_offset != 0);
 
     return 0;
diff --git a/block/qcow2.c b/block/qcow2.c
index 545dea0ab3..cb55e2d897 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1544,23 +1544,21 @@ fail:
     return ret;
 }
 
-static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
-                           int64_t sector_num,
-                           int remaining_sectors,
-                           QEMUIOVector *qiov)
+static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                         uint64_t bytes, QEMUIOVector *qiov,
+                                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int index_in_cluster;
+    int offset_in_cluster;
     int ret;
-    int cur_nr_sectors; /* number of sectors in current iteration */
+    unsigned int cur_bytes; /* number of sectors in current iteration */
     uint64_t cluster_offset;
     QEMUIOVector hd_qiov;
     uint64_t bytes_done = 0;
     uint8_t *cluster_data = NULL;
     QCowL2Meta *l2meta = NULL;
 
-    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
-                                 remaining_sectors);
+    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
 
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
@@ -1568,22 +1566,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
     qemu_co_mutex_lock(&s->lock);
 
-    while (remaining_sectors != 0) {
+    while (bytes != 0) {
 
         l2meta = NULL;
 
         trace_qcow2_writev_start_part(qemu_coroutine_self());
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        cur_nr_sectors = remaining_sectors;
-        if (bs->encrypted &&
-            cur_nr_sectors >
-            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
-            cur_nr_sectors =
-                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster;
+        offset_in_cluster = offset_into_cluster(s, offset);
+        cur_bytes = MIN(bytes, INT_MAX);
+        if (bs->encrypted) {
+            cur_bytes = MIN(cur_bytes,
+                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+                            - offset_in_cluster);
         }
 
-        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            &cur_nr_sectors, &cluster_offset, &l2meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
+                                         &cluster_offset, &l2meta);
         if (ret < 0) {
             goto fail;
         }
@@ -1591,8 +1588,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
         assert((cluster_offset & 511) == 0);
 
         qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
-            cur_nr_sectors * 512);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);
 
         if (bs->encrypted) {
             Error *err = NULL;
@@ -1611,8 +1607,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
             qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
 
-            if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
-                                      cluster_data, cur_nr_sectors,
+            if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
+                                      cluster_data, cluster_data,
+                                      cur_bytes >>BDRV_SECTOR_BITS,
                                       true, &err) < 0) {
                 error_free(err);
                 ret = -EIO;
@@ -1620,13 +1617,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             }
 
             qemu_iovec_reset(&hd_qiov);
-            qemu_iovec_add(&hd_qiov, cluster_data,
-                cur_nr_sectors * 512);
+            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
         }
 
         ret = qcow2_pre_write_overlap_check(bs, 0,
-                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
-                cur_nr_sectors * BDRV_SECTOR_SIZE);
+                cluster_offset + offset_in_cluster, cur_bytes);
         if (ret < 0) {
             goto fail;
         }
@@ -1634,10 +1629,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
         qemu_co_mutex_unlock(&s->lock);
         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         trace_qcow2_writev_data(qemu_coroutine_self(),
-                                (cluster_offset >> 9) + index_in_cluster);
-        ret = bdrv_co_writev(bs->file->bs,
-                             (cluster_offset >> 9) + index_in_cluster,
-                             cur_nr_sectors, &hd_qiov);
+                                cluster_offset + offset_in_cluster);
+        ret = bdrv_co_pwritev(bs->file->bs,
+                              cluster_offset + offset_in_cluster,
+                              cur_bytes, &hd_qiov, 0);
         qemu_co_mutex_lock(&s->lock);
         if (ret < 0) {
             goto fail;
@@ -1663,10 +1658,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             l2meta = next;
         }
 
-        remaining_sectors -= cur_nr_sectors;
-        sector_num += cur_nr_sectors;
-        bytes_done += cur_nr_sectors * 512;
-        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
+        bytes -= cur_bytes;
+        offset += cur_bytes;
+        bytes_done += cur_bytes;
+        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
     }
     ret = 0;
 
@@ -2008,19 +2003,19 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
 
 static int preallocate(BlockDriverState *bs)
 {
-    uint64_t nb_sectors;
+    uint64_t bytes;
     uint64_t offset;
     uint64_t host_offset = 0;
-    int num;
+    unsigned int cur_bytes;
     int ret;
     QCowL2Meta *meta;
 
-    nb_sectors = bdrv_nb_sectors(bs);
+    bytes = bdrv_getlength(bs);
     offset = 0;
 
-    while (nb_sectors) {
-        num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS);
-        ret = qcow2_alloc_cluster_offset(bs, offset, &num,
+    while (bytes) {
+        cur_bytes = MIN(bytes, INT_MAX);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                          &host_offset, &meta);
         if (ret < 0) {
             return ret;
@@ -2046,8 +2041,8 @@ static int preallocate(BlockDriverState *bs)
 
         /* TODO Preallocate data if requested */
 
-        nb_sectors -= num;
-        offset += num << BDRV_SECTOR_BITS;
+        bytes -= cur_bytes;
+        offset += cur_bytes;
     }
 
     /*
@@ -2056,11 +2051,9 @@ static int preallocate(BlockDriverState *bs)
      * EOF). Extend the image to the last allocated sector.
      */
     if (host_offset != 0) {
-        uint8_t buf[BDRV_SECTOR_SIZE];
-        memset(buf, 0, BDRV_SECTOR_SIZE);
-        ret = bdrv_write(bs->file->bs,
-                         (host_offset >> BDRV_SECTOR_BITS) + num - 1,
-                         buf, 1);
+        uint8_t data = 0;
+        ret = bdrv_pwrite(bs->file->bs, (host_offset + cur_bytes) - 1,
+                          &data, 1);
         if (ret < 0) {
             return ret;
         }
@@ -3379,7 +3372,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_set_key       = qcow2_set_key,
 
     .bdrv_co_preadv         = qcow2_co_preadv,
-    .bdrv_co_writev         = qcow2_co_writev,
+    .bdrv_co_pwritev        = qcow2_co_pwritev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
 
     .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
diff --git a/block/qcow2.h b/block/qcow2.h
index 175b0c1740..b36a7bf8ad 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -539,7 +539,8 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                              unsigned int *bytes, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int *num, uint64_t *host_offset, QCowL2Meta **m);
+                               unsigned int *bytes, uint64_t *host_offset,
+                               QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                          uint64_t offset,
                                          int compressed_size);
diff --git a/trace-events b/trace-events
index 2f14205de0..720c644873 100644
--- a/trace-events
+++ b/trace-events
@@ -606,16 +606,16 @@ qemu_system_shutdown_request(void) ""
 qemu_system_powerdown_request(void) ""
 
 # block/qcow2.c
-qcow2_writev_start_req(void *co, int64_t sector, int nb_sectors) "co %p sector %" PRIx64 " nb_sectors %d"
+qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
 qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
 qcow2_writev_start_part(void *co) "co %p"
-qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d"
+qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
 qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
 qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
 
 # block/qcow2-cluster.c
-qcow2_alloc_clusters_offset(void *co, uint64_t offset, int num) "co %p offset %" PRIx64 " num %d"
+qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
 qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
 qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
 qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"

From 38a53d506be5efc83a33498948d8e6ba838dd836 Mon Sep 17 00:00:00 2001
From: Colin Lord <clord@redhat.com>
Date: Wed, 8 Jun 2016 13:56:28 -0400
Subject: [PATCH 06/39] blockdev: clarify error on attempt to open locked tray

When opening a device with a locked tray, gives an error explaining the
device tray is locked and that the user should wait and try again. This
is less confusing than the previous error, which simply stated that the
tray was locked.

Signed-off-by: Colin Lord <clord@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 7fd515a4fa..11177b4fcc 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2544,6 +2544,7 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
     BlockBackend *blk;
     BlockDriverState *medium_bs = NULL;
     int bdrv_flags;
+    int rc;
     QDict *options = NULL;
     Error *err = NULL;
 
@@ -2598,11 +2599,13 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
         goto fail;
     }
 
-    qmp_blockdev_open_tray(device, false, false, &err);
-    if (err) {
+    rc = do_open_tray(device, false, &err);
+    if (rc && rc != -ENOSYS) {
         error_propagate(errp, err);
         goto fail;
     }
+    error_free(err);
+    err = NULL;
 
     qmp_x_blockdev_remove_medium(device, &err);
     if (err) {

From 479b5998d4149bef94a5900291b51804eff39a3d Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Wed, 8 Jun 2016 12:39:17 +0300
Subject: [PATCH 07/39] hmp: acquire aio_context in hmp_qemu_io

Acquire aio context before run command, this is mandatory for unit tests.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hmp.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hmp.c b/hmp.c
index 4b8e9878b1..30897af6ae 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1955,7 +1955,12 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 
     blk = blk_by_name(device);
     if (blk) {
+        AioContext *aio_context = blk_get_aio_context(blk);
+        aio_context_acquire(aio_context);
+
         qemuio_command(blk, command);
+
+        aio_context_release(aio_context);
     } else {
         error_set(&err, ERROR_CLASS_DEVICE_NOT_FOUND,
                   "Device '%s' not found", device);

From 9e19036e5a3c5abfe57984ca98ab064acfca83a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Tue, 31 May 2016 13:36:05 +0200
Subject: [PATCH 08/39] m25p80: fix test on blk_pread() return value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 243e6f69c129 ("m25p80: Switch to byte-based block access")
replaced blk_read() calls with blk_pread() but return values are
different.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/m25p80.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c
index 4c856f5278..51d8596056 100644
--- a/hw/block/m25p80.c
+++ b/hw/block/m25p80.c
@@ -900,7 +900,7 @@ static int m25p80_init(SSISlave *ss)
         s->storage = blk_blockalign(s->blk, s->size);
 
         /* FIXME: Move to late init */
-        if (blk_pread(s->blk, 0, s->storage, s->size)) {
+        if (blk_pread(s->blk, 0, s->storage, s->size) != s->size) {
             fprintf(stderr, "Failed to initialize SPI flash!\n");
             return 1;
         }

From 604e86136266d104210a74be4c82643300c74e69 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 14 Jun 2016 11:29:32 +0200
Subject: [PATCH 09/39] qemu-img bench: Fix uninitialised writethrough mode

If no -t option is specified, bool writethrough stayed uninitialised.
Initialise it as false, which makes cache=writeback the default cache
mode.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qemu-img.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index 251386b49d..14e2661a5c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3570,7 +3570,7 @@ static int img_bench(int argc, char **argv)
     BlockBackend *blk = NULL;
     BenchData data = {};
     int flags = 0;
-    bool writethrough;
+    bool writethrough = false;
     struct timeval t1, t2;
     int i;
 

From 73698c30caaf16e990b8b2e6e7bc64928947e65e Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Mon, 13 Jun 2016 12:56:34 -0600
Subject: [PATCH 10/39] block: Avoid bogus flags during mirroring

Commit e253f4b8 converted mirroring from sector-based bdrv_aio_*
to byte-based blk_aio_*, but failed to account for the subtle
difference in signatures (the former takes a semi-redundant length,
the latter takes a flags parameter).  Since all of our flags are
currently smaller in size than BDRV_SECTOR_SIZE, it has no ill
effects until we either perform sub-sector mirroring, or we start
asserting that no unexpected flags are set.  I found it while
testing new asserts when qemu-iotests 132 started warning about an
unknown flag 0x200000.

Signed-off-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 80fd3c7469..1f01f2488c 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -157,8 +157,7 @@ static void mirror_read_complete(void *opaque, int ret)
         return;
     }
     blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
-                    op->nb_sectors * BDRV_SECTOR_SIZE,
-                    mirror_write_complete, op);
+                    0, mirror_write_complete, op);
 }
 
 static inline void mirror_clip_sectors(MirrorBlockJob *s,
@@ -274,8 +273,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
     s->sectors_in_flight += nb_sectors;
     trace_mirror_one_iteration(s, sector_num, nb_sectors);
 
-    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov,
-                   nb_sectors * BDRV_SECTOR_SIZE,
+    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                    mirror_read_complete, op);
     return ret;
 }

From fa166538743d4e28de7374c41332c3e448826f4b Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Mon, 13 Jun 2016 12:56:35 -0600
Subject: [PATCH 11/39] block: Assert that flags are in range

Add a new BDRV_REQ_MASK constant, and use it to make sure that
caller flags are always valid.

Tested with 'make check' and with qemu-iotests on both '-raw'
and '-qcow2'; the only failure turned up was fixed in the
previous commit.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c            | 6 ++++++
 include/block/block.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/block/io.c b/block/io.c
index fb99a7151c..5b2017fedc 100644
--- a/block/io.c
+++ b/block/io.c
@@ -776,6 +776,8 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
     int64_t sector_num;
     unsigned int nb_sectors;
 
+    assert(!(flags & ~BDRV_REQ_MASK));
+
     if (drv->bdrv_co_preadv) {
         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
     }
@@ -815,6 +817,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
     unsigned int nb_sectors;
     int ret;
 
+    assert(!(flags & ~BDRV_REQ_MASK));
+
     if (drv->bdrv_co_pwritev) {
         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags);
@@ -953,6 +957,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+    assert(!(flags & ~BDRV_REQ_MASK));
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
@@ -1242,6 +1247,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+    assert(!(flags & ~BDRV_REQ_MASK));
 
     waited = wait_serialising_requests(req);
     assert(!waited || !req->serialising);
diff --git a/include/block/block.h b/include/block/block.h
index 54cca28bac..8cabcddf6c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -65,6 +65,9 @@ typedef enum {
     BDRV_REQ_MAY_UNMAP          = 0x4,
     BDRV_REQ_NO_SERIALISING     = 0x8,
     BDRV_REQ_FUA                = 0x10,
+
+    /* Mask of valid flags */
+    BDRV_REQ_MASK               = 0x1f,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {

From 8c0dcbc4ad2bf4f9f3b27c637b357e87cad70ec7 Mon Sep 17 00:00:00 2001
From: "Daniel P. Berrange" <berrange@redhat.com>
Date: Mon, 13 Jun 2016 12:30:09 +0100
Subject: [PATCH 12/39] block: drop support for using qcow[2] encryption with
 system emulators

Back in the 2.3.0 release we declared qcow[2] encryption as
deprecated, warning people that it would be removed in a future
release.

  commit a1f688f4152e65260b94f37543521ceff8bfebe4
  Author: Markus Armbruster <armbru@redhat.com>
  Date:   Fri Mar 13 21:09:40 2015 +0100

    block: Deprecate QCOW/QCOW2 encryption

The code still exists today, but by a (happy?) accident we entirely
broke the ability to use qcow[2] encryption in the system emulators
in the 2.4.0 release due to

  commit 8336aafae1451d54c81dd2b187b45f7c45d2428e
  Author: Daniel P. Berrange <berrange@redhat.com>
  Date:   Tue May 12 17:09:18 2015 +0100

    qcow2/qcow: protect against uninitialized encryption key

This commit was designed to prevent future coding bugs which
might cause QEMU to read/write data on an encrypted block
device in plain text mode before a decryption key is set.

It turns out this preventative measure was a little too good,
because we already had a long standing bug where QEMU read
encrypted data in plain text mode during system emulator
startup, in order to guess disk geometry:

  Thread 10 (Thread 0x7fffd3fff700 (LWP 30373)):
  #0  0x00007fffe90b1a28 in raise () at /lib64/libc.so.6
  #1  0x00007fffe90b362a in abort () at /lib64/libc.so.6
  #2  0x00007fffe90aa227 in __assert_fail_base () at /lib64/libc.so.6
  #3  0x00007fffe90aa2d2 in  () at /lib64/libc.so.6
  #4  0x000055555587ae19 in qcow2_co_readv (bs=0x5555562accb0, sector_num=0, remaining_sectors=1, qiov=0x7fffffffd260) at block/qcow2.c:1229
  #5  0x000055555589b60d in bdrv_aligned_preadv (bs=bs@entry=0x5555562accb0, req=req@entry=0x7fffd3ffea50, offset=offset@entry=0, bytes=bytes@entry=512, align=align@entry=512, qiov=qiov@entry=0x7fffffffd260, flags=0) at block/io.c:908
  #6  0x000055555589b8bc in bdrv_co_do_preadv (bs=0x5555562accb0, offset=0, bytes=512, qiov=0x7fffffffd260, flags=<optimized out>) at block/io.c:999
  #7  0x000055555589c375 in bdrv_rw_co_entry (opaque=0x7fffffffd210) at block/io.c:544
  #8  0x000055555586933b in coroutine_thread (opaque=0x555557876310) at coroutine-gthread.c:134
  #9  0x00007ffff64e1835 in g_thread_proxy (data=0x5555562b5590) at gthread.c:778
  #10 0x00007ffff6bb760a in start_thread () at /lib64/libpthread.so.0
  #11 0x00007fffe917f59d in clone () at /lib64/libc.so.6

  Thread 1 (Thread 0x7ffff7ecab40 (LWP 30343)):
  #0  0x00007fffe91797a9 in syscall () at /lib64/libc.so.6
  #1  0x00007ffff64ff87f in g_cond_wait (cond=cond@entry=0x555555e085f0 <coroutine_cond>, mutex=mutex@entry=0x555555e08600 <coroutine_lock>) at gthread-posix.c:1397
  #2  0x00005555558692c3 in qemu_coroutine_switch (co=<optimized out>) at coroutine-gthread.c:117
  #3  0x00005555558692c3 in qemu_coroutine_switch (from_=0x5555562b5e30, to_=to_@entry=0x555557876310, action=action@entry=COROUTINE_ENTER) at coroutine-gthread.c:175
  #4  0x0000555555868a90 in qemu_coroutine_enter (co=0x555557876310, opaque=0x0) at qemu-coroutine.c:116
  #5  0x0000555555859b84 in thread_pool_completion_bh (opaque=0x7fffd40010e0) at thread-pool.c:187
  #6  0x0000555555859514 in aio_bh_poll (ctx=ctx@entry=0x5555562953b0) at async.c:85
  #7  0x0000555555864d10 in aio_dispatch (ctx=ctx@entry=0x5555562953b0) at aio-posix.c:135
  #8  0x0000555555864f75 in aio_poll (ctx=ctx@entry=0x5555562953b0, blocking=blocking@entry=true) at aio-posix.c:291
  #9  0x000055555589c40d in bdrv_prwv_co (bs=bs@entry=0x5555562accb0, offset=offset@entry=0, qiov=qiov@entry=0x7fffffffd260, is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at block/io.c:591
  #10 0x000055555589c503 in bdrv_rw_co (bs=bs@entry=0x5555562accb0, sector_num=sector_num@entry=0, buf=buf@entry=0x7fffffffd2e0 "\321,", nb_sectors=nb_sectors@entry=21845, is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at block/io.c:614
  #11 0x000055555589c562 in bdrv_read_unthrottled (nb_sectors=21845, buf=0x7fffffffd2e0 "\321,", sector_num=0, bs=0x5555562accb0) at block/io.c:622
  #12 0x000055555589c562 in bdrv_read_unthrottled (bs=0x5555562accb0, sector_num=sector_num@entry=0, buf=buf@entry=0x7fffffffd2e0 "\321,", nb_sectors=nb_sectors@entry=21845) at block/io.c:634
    nb_sectors@entry=1) at block/block-backend.c:504
  #14 0x0000555555752e9f in guess_disk_lchs (blk=blk@entry=0x5555562a5290, pcylinders=pcylinders@entry=0x7fffffffd52c, pheads=pheads@entry=0x7fffffffd530, psectors=psectors@entry=0x7fffffffd534) at hw/block/hd-geometry.c:68
  #15 0x0000555555752ff7 in hd_geometry_guess (blk=0x5555562a5290, pcyls=pcyls@entry=0x555557875d1c, pheads=pheads@entry=0x555557875d20, psecs=psecs@entry=0x555557875d24, ptrans=ptrans@entry=0x555557875d28) at hw/block/hd-geometry.c:133
  #16 0x0000555555752b87 in blkconf_geometry (conf=conf@entry=0x555557875d00, ptrans=ptrans@entry=0x555557875d28, cyls_max=cyls_max@entry=65536, heads_max=heads_max@entry=16, secs_max=secs_max@entry=255, errp=errp@entry=0x7fffffffd5e0) at hw/block/block.c:71
  #17 0x0000555555799bc4 in ide_dev_initfn (dev=0x555557875c80, kind=IDE_HD) at hw/ide/qdev.c:174
  #18 0x0000555555768394 in device_realize (dev=0x555557875c80, errp=0x7fffffffd640) at hw/core/qdev.c:247
  #19 0x0000555555769a81 in device_set_realized (obj=0x555557875c80, value=<optimized out>, errp=0x7fffffffd730) at hw/core/qdev.c:1058
  #20 0x00005555558240ce in property_set_bool (obj=0x555557875c80, v=<optimized out>, opaque=0x555557875de0, name=<optimized out>, errp=0x7fffffffd730)
        at qom/object.c:1514
  #21 0x0000555555826c87 in object_property_set_qobject (obj=obj@entry=0x555557875c80, value=value@entry=0x55555784bcb0, name=name@entry=0x55555591cb3d "realized", errp=errp@entry=0x7fffffffd730) at qom/qom-qobject.c:24
  #22 0x0000555555825760 in object_property_set_bool (obj=obj@entry=0x555557875c80, value=value@entry=true, name=name@entry=0x55555591cb3d "realized", errp=errp@entry=0x7fffffffd730) at qom/object.c:905
  #23 0x000055555576897b in qdev_init_nofail (dev=dev@entry=0x555557875c80) at hw/core/qdev.c:380
  #24 0x0000555555799ead in ide_create_drive (bus=bus@entry=0x555557629630, unit=unit@entry=0, drive=0x5555562b77e0) at hw/ide/qdev.c:122
  #25 0x000055555579a746 in pci_ide_create_devs (dev=dev@entry=0x555557628db0, hd_table=hd_table@entry=0x7fffffffd830) at hw/ide/pci.c:440
  #26 0x000055555579b165 in pci_piix3_ide_init (bus=<optimized out>, hd_table=0x7fffffffd830, devfn=<optimized out>) at hw/ide/piix.c:218
  #27 0x000055555568ca55 in pc_init1 (machine=0x5555562960a0, pci_enabled=1, kvmclock_enabled=<optimized out>) at /home/berrange/src/virt/qemu/hw/i386/pc_piix.c:256
  #28 0x0000555555603ab2 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4249

So the safety net is correctly preventing QEMU reading cipher
text as if it were plain text, during startup and aborting QEMU
to avoid bad usage of this data.

For added fun this bug only happens if the encrypted qcow2
file happens to have data written to the first cluster,
otherwise the cluster won't be allocated and so qcow2 would
not try the decryption routines at all, just return all 0's.

That no one even noticed, let alone reported, this bug that
has shipped in 2.4.0, 2.5.0 and 2.6.0 shows that the number
of actual users of encrypted qcow2 is approximately zero.

So rather than fix the crash, and backport it to stable
releases, just go ahead with what we have warned users about
and disable any use of qcow2 encryption in the system
emulators. qemu-img/qemu-io/qemu-nbd are still able to access
qcow2 encrypted images for the sake of data conversion.

In the future, qcow2 will gain support for the alternative
luks format, but when this happens it'll be using the
'-object secret' infrastructure for getting keys, which
avoids this problematic scenario entirely.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow.c               | 14 ++++++++++----
 block/qcow2.c              | 14 ++++++++++----
 tests/qemu-iotests/087.out | 12 ++----------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index c5cf813469..312af52816 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -162,10 +162,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     if (s->crypt_method_header) {
         if (bdrv_uses_whitelist() &&
             s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_report("qcow built-in AES encryption is deprecated");
-            error_printf("Support for it will be removed in a future release.\n"
-                         "You can use 'qemu-img convert' to switch to an\n"
-                         "unencrypted qcow image, or a LUKS raw image.\n");
+            error_setg(errp,
+                       "Use of AES-CBC encrypted qcow images is no longer "
+                       "supported in system emulators");
+            error_append_hint(errp,
+                              "You can use 'qemu-img convert' to convert your "
+                              "image to an alternative supported format, such "
+                              "as unencrypted qcow, or raw with the LUKS "
+                              "format instead.\n");
+            ret = -ENOSYS;
+            goto fail;
         }
 
         bs->encrypted = 1;
diff --git a/block/qcow2.c b/block/qcow2.c
index cb55e2d897..9de716a524 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -968,10 +968,16 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     if (s->crypt_method_header) {
         if (bdrv_uses_whitelist() &&
             s->crypt_method_header == QCOW_CRYPT_AES) {
-            error_report("qcow2 built-in AES encryption is deprecated");
-            error_printf("Support for it will be removed in a future release.\n"
-                         "You can use 'qemu-img convert' to switch to an\n"
-                         "unencrypted qcow2 image, or a LUKS raw image.\n");
+            error_setg(errp,
+                       "Use of AES-CBC encrypted qcow2 images is no longer "
+                       "supported in system emulators");
+            error_append_hint(errp,
+                              "You can use 'qemu-img convert' to convert your "
+                              "image to an alternative supported format, such "
+                              "as unencrypted qcow2, or raw with the LUKS "
+                              "format instead.\n");
+            ret = -ENOSYS;
+            goto fail;
         }
 
         bs->encrypted = 1;
diff --git a/tests/qemu-iotests/087.out b/tests/qemu-iotests/087.out
index 055c553cdb..a95c4b0be8 100644
--- a/tests/qemu-iotests/087.out
+++ b/tests/qemu-iotests/087.out
@@ -42,22 +42,14 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 encryption=on
 Testing: -S
 QMP_VERSION
 {"return": {}}
-IMGFMT built-in AES encryption is deprecated
-Support for it will be removed in a future release.
-You can use 'qemu-img convert' to switch to an
-unencrypted IMGFMT image, or a LUKS raw image.
-{"error": {"class": "GenericError", "desc": "blockdev-add doesn't support encrypted devices"}}
+{"error": {"class": "GenericError", "desc": "Use of AES-CBC encrypted IMGFMT images is no longer supported in system emulators"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN"}
 
 Testing:
 QMP_VERSION
 {"return": {}}
-IMGFMT built-in AES encryption is deprecated
-Support for it will be removed in a future release.
-You can use 'qemu-img convert' to switch to an
-unencrypted IMGFMT image, or a LUKS raw image.
-{"error": {"class": "GenericError", "desc": "Guest must be stopped for opening of encrypted image"}}
+{"error": {"class": "GenericError", "desc": "Use of AES-CBC encrypted IMGFMT images is no longer supported in system emulators"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN"}
 

From 244483e64ee726cc89a1e05bed2be0ed37071403 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 2 Jun 2016 11:41:52 +0200
Subject: [PATCH 13/39] block: Byte-based bdrv_co_do_copy_on_readv()

In a first step to convert the common I/O path to work on bytes rather
than sectors, this converts the copy-on-read logic that is used by
bdrv_aligned_preadv().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c            | 63 ++++++++++++++++++++++++++-----------------
 block/mirror.c        | 10 ++++---
 include/block/block.h | 10 ++++---
 trace-events          |  2 +-
 4 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/block/io.c b/block/io.c
index 5b2017fedc..b6a2c800a1 100644
--- a/block/io.c
+++ b/block/io.c
@@ -404,12 +404,12 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 }
 
 /**
- * Round a region to cluster boundaries
+ * Round a region to cluster boundaries (sector-based)
  */
-void bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t sector_num, int nb_sectors,
-                            int64_t *cluster_sector_num,
-                            int *cluster_nb_sectors)
+void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
+                                    int64_t sector_num, int nb_sectors,
+                                    int64_t *cluster_sector_num,
+                                    int *cluster_nb_sectors)
 {
     BlockDriverInfo bdi;
 
@@ -424,6 +424,26 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
     }
 }
 
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+                            int64_t offset, unsigned int bytes,
+                            int64_t *cluster_offset,
+                            unsigned int *cluster_bytes)
+{
+    BlockDriverInfo bdi;
+
+    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+        *cluster_offset = offset;
+        *cluster_bytes = bytes;
+    } else {
+        int64_t c = bdi.cluster_size;
+        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
+        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
+    }
+}
+
 static int bdrv_get_cluster_size(BlockDriverState *bs)
 {
     BlockDriverInfo bdi;
@@ -865,7 +885,7 @@ emulate_flags:
 }
 
 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 {
     /* Perform I/O through a temporary buffer so that users who scribble over
      * their read buffer while the operation is in progress do not end up
@@ -877,21 +897,20 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
     BlockDriver *drv = bs->drv;
     struct iovec iov;
     QEMUIOVector bounce_qiov;
-    int64_t cluster_sector_num;
-    int cluster_nb_sectors;
+    int64_t cluster_offset;
+    unsigned int cluster_bytes;
     size_t skip_bytes;
     int ret;
 
     /* Cover entire cluster so no additional backing file I/O is required when
      * allocating cluster in the image file.
      */
-    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
-                           &cluster_sector_num, &cluster_nb_sectors);
+    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 
-    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
-                                   cluster_sector_num, cluster_nb_sectors);
+    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
+                                   cluster_offset, cluster_bytes);
 
-    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+    iov.iov_len = cluster_bytes;
     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
     if (bounce_buffer == NULL) {
         ret = -ENOMEM;
@@ -900,8 +919,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 
     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-    ret = bdrv_driver_preadv(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
-                             cluster_nb_sectors * BDRV_SECTOR_SIZE,
+    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
                              &bounce_qiov, 0);
     if (ret < 0) {
         goto err;
@@ -909,16 +927,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 
     if (drv->bdrv_co_pwrite_zeroes &&
         buffer_is_zero(bounce_buffer, iov.iov_len)) {
-        ret = bdrv_co_do_pwrite_zeroes(bs,
-                                       cluster_sector_num * BDRV_SECTOR_SIZE,
-                                       cluster_nb_sectors * BDRV_SECTOR_SIZE,
-                                       0);
+        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
     } else {
         /* This does not change the data on the disk, it is not necessary
          * to flush even in cache=writethrough mode.
          */
-        ret = bdrv_driver_pwritev(bs, cluster_sector_num * BDRV_SECTOR_SIZE,
-                                  cluster_nb_sectors * BDRV_SECTOR_SIZE,
+        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
                                   &bounce_qiov, 0);
     }
 
@@ -930,9 +944,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
         goto err;
     }
 
-    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
-    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
-                        nb_sectors * BDRV_SECTOR_SIZE);
+    skip_bytes = offset - cluster_offset;
+    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
 
 err:
     qemu_vfree(bounce_buffer);
@@ -982,7 +995,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
 
         if (!ret || pnum != nb_sectors) {
-            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+            ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
             goto out;
         }
     }
diff --git a/block/mirror.c b/block/mirror.c
index 1f01f2488c..41848b2c8e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -185,8 +185,9 @@ static int mirror_cow_align(MirrorBlockJob *s,
     need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                           s->cow_bitmap);
     if (need_cow) {
-        bdrv_round_to_clusters(blk_bs(s->target), *sector_num, *nb_sectors,
-                               &align_sector_num, &align_nb_sectors);
+        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
+                                       *nb_sectors, &align_sector_num,
+                                       &align_nb_sectors);
     }
 
     if (align_nb_sectors > max_sectors) {
@@ -384,8 +385,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
             int64_t target_sector_num;
             int target_nb_sectors;
-            bdrv_round_to_clusters(blk_bs(s->target), sector_num, io_sectors,
-                                   &target_sector_num, &target_nb_sectors);
+            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
+                                           io_sectors,  &target_sector_num,
+                                           &target_nb_sectors);
             if (target_sector_num == sector_num &&
                 target_nb_sectors == io_sectors) {
                 mirror_method = ret & BDRV_BLOCK_ZERO ?
diff --git a/include/block/block.h b/include/block/block.h
index 8cabcddf6c..9c3a62cc0a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -404,10 +404,14 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                           const uint8_t *buf, int nb_sectors);
 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs);
+void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
+                                    int64_t sector_num, int nb_sectors,
+                                    int64_t *cluster_sector_num,
+                                    int *cluster_nb_sectors);
 void bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t sector_num, int nb_sectors,
-                            int64_t *cluster_sector_num,
-                            int *cluster_nb_sectors);
+                            int64_t offset, unsigned int bytes,
+                            int64_t *cluster_offset,
+                            unsigned int *cluster_bytes);
 
 const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
 void bdrv_get_backing_filename(BlockDriverState *bs,
diff --git a/trace-events b/trace-events
index 720c644873..104b64fae1 100644
--- a/trace-events
+++ b/trace-events
@@ -73,7 +73,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
-bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
+bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
 
 # block/stream.c
 stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"

From 49c0752600d22606b411ace75b22e65398999d95 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 3 Jun 2016 16:17:28 +0200
Subject: [PATCH 14/39] block: Prepare bdrv_aligned_preadv() for byte-aligned
 requests

This patch makes bdrv_aligned_preadv() ready to accept byte-aligned
requests. Note that this doesn't mean that such requests are actually
made. The caller still ensures that all requests are aligned to at least
512 bytes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/block/io.c b/block/io.c
index b6a2c800a1..e75bce263d 100644
--- a/block/io.c
+++ b/block/io.c
@@ -963,11 +963,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 {
     int ret;
 
-    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
-
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(is_power_of_2(align));
+    assert((offset & (align - 1)) == 0);
+    assert((bytes & (align - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
     assert(!(flags & ~BDRV_REQ_MASK));
@@ -987,9 +985,12 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
+        int64_t start_sector = offset >> BDRV_SECTOR_BITS;
+        int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+        unsigned int nb_sectors = end_sector - start_sector;
         int pnum;
 
-        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
         if (ret < 0) {
             goto out;
         }
@@ -1005,28 +1006,24 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
     } else {
         /* Read zeros after EOF */
-        int64_t total_sectors, max_nb_sectors;
+        int64_t total_bytes, max_bytes;
 
-        total_sectors = bdrv_nb_sectors(bs);
-        if (total_sectors < 0) {
-            ret = total_sectors;
+        total_bytes = bdrv_getlength(bs);
+        if (total_bytes < 0) {
+            ret = total_bytes;
             goto out;
         }
 
-        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
-                                  align >> BDRV_SECTOR_BITS);
-        if (nb_sectors < max_nb_sectors) {
+        max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
+        if (bytes < max_bytes) {
             ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
-        } else if (max_nb_sectors > 0) {
+        } else if (max_bytes > 0) {
             QEMUIOVector local_qiov;
 
             qemu_iovec_init(&local_qiov, qiov->niov);
-            qemu_iovec_concat(&local_qiov, qiov, 0,
-                              max_nb_sectors * BDRV_SECTOR_SIZE);
+            qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
 
-            ret = bdrv_driver_preadv(bs, offset,
-                                     max_nb_sectors * BDRV_SECTOR_SIZE,
-                                     &local_qiov, 0);
+            ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
 
             qemu_iovec_destroy(&local_qiov);
         } else {
@@ -1034,11 +1031,10 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
 
         /* Reading beyond end of file is supposed to produce zeroes */
-        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
-            uint64_t offset = MAX(0, total_sectors - sector_num);
-            uint64_t bytes = (sector_num + nb_sectors - offset) *
-                              BDRV_SECTOR_SIZE;
-            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
+        if (ret == 0 && total_bytes < offset + bytes) {
+            uint64_t zero_offset = MAX(0, total_bytes - offset);
+            uint64_t zero_bytes = offset + bytes - zero_offset;
+            qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
         }
     }
 

From 9896c8765fdd916844596c3e6929b3c641e930b1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 3 Jun 2016 18:42:51 +0200
Subject: [PATCH 15/39] block: Prepare bdrv_aligned_pwritev() for byte-aligned
 requests

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/block/io.c b/block/io.c
index e75bce263d..b261cc6327 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1249,11 +1249,9 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     bool waited;
     int ret;
 
-    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
-    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
+    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
 
-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
     assert(!(flags & ~BDRV_REQ_MASK));
@@ -1278,22 +1276,21 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
-        ret = bdrv_co_do_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
-                                       nb_sectors << BDRV_SECTOR_BITS, flags);
+        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
     } else {
         bdrv_debug_event(bs, BLKDBG_PWRITEV);
         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
     }
     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
 
-    bdrv_set_dirty(bs, sector_num, nb_sectors);
+    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
 
     if (bs->wr_highest_offset < offset + bytes) {
         bs->wr_highest_offset = offset + bytes;
     }
 
     if (ret >= 0) {
-        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
+        bs->total_sectors = MAX(bs->total_sectors, end_sector);
     }
 
     return ret;

From 2174f12bdeb3974141784e14bbb7ad8c53178cd9 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 6 Aug 2014 17:18:07 +0200
Subject: [PATCH 16/39] raw-posix: Switch to bdrv_co_* interfaces

In order to use the modern byte-based .bdrv_co_preadv/pwritev()
interface, this patch switches raw-posix to coroutine-based interfaces
as a first step. In terms of semantics and performance, it doesn't make
a difference with the existing code whether we go from a coroutine to a
callback-based interface already in block/io.c or only in linux-aio.c

As there have been concerns in the past that this change may be a step
in the wrong direction with respect to a possible AIO fast path, the
old callback-based interface for linux-aio is left around and can be
reactivated when a fast path (e.g. directly from virtio-blk dataplane,
bypassing the whole block layer) is implemented.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/linux-aio.c | 87 +++++++++++++++++++++++++++++++++++------------
 block/raw-aio.h   |  4 +++
 block/raw-posix.c | 59 +++++++++++++++-----------------
 3 files changed, 96 insertions(+), 54 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index 90ec98ee23..657577a22d 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -11,8 +11,10 @@
 #include "qemu-common.h"
 #include "block/aio.h"
 #include "qemu/queue.h"
+#include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/event_notifier.h"
+#include "qemu/coroutine.h"
 
 #include <libaio.h>
 
@@ -30,6 +32,7 @@
 
 struct qemu_laiocb {
     BlockAIOCB common;
+    Coroutine *co;
     LinuxAioState *ctx;
     struct iocb iocb;
     ssize_t ret;
@@ -88,9 +91,14 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
             }
         }
     }
-    laiocb->common.cb(laiocb->common.opaque, ret);
 
-    qemu_aio_unref(laiocb);
+    laiocb->ret = ret;
+    if (laiocb->co) {
+        qemu_coroutine_enter(laiocb->co, NULL);
+    } else {
+        laiocb->common.cb(laiocb->common.opaque, ret);
+        qemu_aio_unref(laiocb);
+    }
 }
 
 /* The completion BH fetches completed I/O requests and invokes their
@@ -230,22 +238,12 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
     }
 }
 
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
+static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
+                          int type)
 {
-    struct qemu_laiocb *laiocb;
-    struct iocb *iocbs;
-    off_t offset = sector_num * 512;
-
-    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
-    laiocb->nbytes = nb_sectors * 512;
-    laiocb->ctx = s;
-    laiocb->ret = -EINPROGRESS;
-    laiocb->is_read = (type == QEMU_AIO_READ);
-    laiocb->qiov = qiov;
-
-    iocbs = &laiocb->iocb;
+    LinuxAioState *s = laiocb->ctx;
+    struct iocb *iocbs = &laiocb->iocb;
+    QEMUIOVector *qiov = laiocb->qiov;
 
     switch (type) {
     case QEMU_AIO_WRITE:
@@ -258,7 +256,7 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
     default:
         fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                         __func__, type);
-        goto out_free_aiocb;
+        return -EIO;
     }
     io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
 
@@ -268,11 +266,56 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
         (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
         ioq_submit(s);
     }
-    return &laiocb->common;
 
-out_free_aiocb:
-    qemu_aio_unref(laiocb);
-    return NULL;
+    return 0;
+}
+
+int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+                                int64_t sector_num, QEMUIOVector *qiov,
+                                int nb_sectors, int type)
+{
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+    int ret;
+
+    struct qemu_laiocb laiocb = {
+        .co         = qemu_coroutine_self(),
+        .nbytes     = nb_sectors * BDRV_SECTOR_SIZE,
+        .ctx        = s,
+        .is_read    = (type == QEMU_AIO_READ),
+        .qiov       = qiov,
+    };
+
+    ret = laio_do_submit(fd, &laiocb, offset, type);
+    if (ret < 0) {
+        return ret;
+    }
+
+    qemu_coroutine_yield();
+    return laiocb.ret;
+}
+
+BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laiocb *laiocb;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+    int ret;
+
+    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
+    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+    laiocb->is_read = (type == QEMU_AIO_READ);
+    laiocb->qiov = qiov;
+
+    ret = laio_do_submit(fd, laiocb, offset, type);
+    if (ret < 0) {
+        qemu_aio_unref(laiocb);
+        return NULL;
+    }
+
+    return &laiocb->common;
 }
 
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
diff --git a/block/raw-aio.h b/block/raw-aio.h
index 714714e016..03bbfba565 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -15,6 +15,7 @@
 #ifndef QEMU_RAW_AIO_H
 #define QEMU_RAW_AIO_H
 
+#include "qemu/coroutine.h"
 #include "qemu/iov.h"
 
 /* AIO request types */
@@ -38,6 +39,9 @@
 typedef struct LinuxAioState LinuxAioState;
 LinuxAioState *laio_init(void);
 void laio_cleanup(LinuxAioState *s);
+int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+                                int64_t sector_num, QEMUIOVector *qiov,
+                                int nb_sectors, int type);
 BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockCompletionFunc *cb, void *opaque, int type);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ce2e20f203..cb987692c6 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1325,14 +1325,13 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
 
-static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque, int type)
+static int coroutine_fn raw_co_rw(BlockDriverState *bs, int64_t sector_num,
+                                  int nb_sectors, QEMUIOVector *qiov, int type)
 {
     BDRVRawState *s = bs->opaque;
 
     if (fd_open(bs) < 0)
-        return NULL;
+        return -EIO;
 
     /*
      * Check if the underlying device requires requests to be aligned,
@@ -1345,14 +1344,26 @@ static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
         } else if (s->use_aio) {
-            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
-                               nb_sectors, cb, opaque, type);
+            return laio_co_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+                                  nb_sectors, type);
 #endif
         }
     }
 
-    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
-                       cb, opaque, type);
+    return paio_submit_co(bs, s->fd, sector_num * BDRV_SECTOR_SIZE, qiov,
+                          nb_sectors * BDRV_SECTOR_SIZE, type);
+}
+
+static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
+                                     int nb_sectors, QEMUIOVector *qiov)
+{
+    return raw_co_rw(bs, sector_num, nb_sectors, qiov, QEMU_AIO_READ);
+}
+
+static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return raw_co_rw(bs, sector_num, nb_sectors, qiov, QEMU_AIO_WRITE);
 }
 
 static void raw_aio_plug(BlockDriverState *bs)
@@ -1375,22 +1386,6 @@ static void raw_aio_unplug(BlockDriverState *bs)
 #endif
 }
 
-static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
-                          cb, opaque, QEMU_AIO_READ);
-}
-
-static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
-{
-    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
-                          cb, opaque, QEMU_AIO_WRITE);
-}
-
 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
         BlockCompletionFunc *cb, void *opaque)
 {
@@ -1957,8 +1952,8 @@ BlockDriver bdrv_file = {
     .bdrv_co_get_block_status = raw_co_get_block_status,
     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
 
-    .bdrv_aio_readv = raw_aio_readv,
-    .bdrv_aio_writev = raw_aio_writev,
+    .bdrv_co_readv          = raw_co_readv,
+    .bdrv_co_writev         = raw_co_writev,
     .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_discard = raw_aio_discard,
     .bdrv_refresh_limits = raw_refresh_limits,
@@ -2405,8 +2400,8 @@ static BlockDriver bdrv_host_device = {
     .create_opts         = &raw_create_opts,
     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
 
-    .bdrv_aio_readv	= raw_aio_readv,
-    .bdrv_aio_writev	= raw_aio_writev,
+    .bdrv_co_readv          = raw_co_readv,
+    .bdrv_co_writev         = raw_co_writev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_aio_discard   = hdev_aio_discard,
     .bdrv_refresh_limits = raw_refresh_limits,
@@ -2535,8 +2530,8 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_create         = hdev_create,
     .create_opts         = &raw_create_opts,
 
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_co_readv          = raw_co_readv,
+    .bdrv_co_writev         = raw_co_writev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
@@ -2670,8 +2665,8 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_create        = hdev_create,
     .create_opts        = &raw_create_opts,
 
-    .bdrv_aio_readv     = raw_aio_readv,
-    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_co_readv          = raw_co_readv,
+    .bdrv_co_writev         = raw_co_writev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,

From 9d52aa3c38484703c05e791bac1ff588588a217c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 3 Jun 2016 17:36:27 +0200
Subject: [PATCH 17/39] raw-posix: Implement .bdrv_co_preadv/pwritev

The raw-posix block driver actually supports byte-aligned requests now
on non-O_DIRECT images, like it already (and previously incorrectly)
claimed in bs->request_alignment.

For some block drivers this means that a RMW cycle can be avoided when
they write sub-sector metadata e.g. for cluster allocation.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/linux-aio.c |  7 ++-----
 block/raw-aio.h   |  3 +--
 block/raw-posix.c | 43 +++++++++++++++++++++++--------------------
 3 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index 657577a22d..fe7cece415 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -271,15 +271,12 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 }
 
 int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-                                int64_t sector_num, QEMUIOVector *qiov,
-                                int nb_sectors, int type)
+                                uint64_t offset, QEMUIOVector *qiov, int type)
 {
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;
     int ret;
-
     struct qemu_laiocb laiocb = {
         .co         = qemu_coroutine_self(),
-        .nbytes     = nb_sectors * BDRV_SECTOR_SIZE,
+        .nbytes     = qiov->size,
         .ctx        = s,
         .is_read    = (type == QEMU_AIO_READ),
         .qiov       = qiov,
diff --git a/block/raw-aio.h b/block/raw-aio.h
index 03bbfba565..a4cdbbf1b7 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -40,8 +40,7 @@ typedef struct LinuxAioState LinuxAioState;
 LinuxAioState *laio_init(void);
 void laio_cleanup(LinuxAioState *s);
 int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-                                int64_t sector_num, QEMUIOVector *qiov,
-                                int nb_sectors, int type);
+                                uint64_t offset, QEMUIOVector *qiov, int type);
 BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockCompletionFunc *cb, void *opaque, int type);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index cb987692c6..aacf13203f 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1325,8 +1325,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
 
-static int coroutine_fn raw_co_rw(BlockDriverState *bs, int64_t sector_num,
-                                  int nb_sectors, QEMUIOVector *qiov, int type)
+static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
+                                   uint64_t bytes, QEMUIOVector *qiov, int type)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -1344,26 +1344,28 @@ static int coroutine_fn raw_co_rw(BlockDriverState *bs, int64_t sector_num,
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
         } else if (s->use_aio) {
-            return laio_co_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
-                                  nb_sectors, type);
+            assert(qiov->size == bytes);
+            return laio_co_submit(bs, s->aio_ctx, s->fd, offset, qiov, type);
 #endif
         }
     }
 
-    return paio_submit_co(bs, s->fd, sector_num * BDRV_SECTOR_SIZE, qiov,
-                          nb_sectors * BDRV_SECTOR_SIZE, type);
+    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
 }
 
-static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
-                                     int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                      uint64_t bytes, QEMUIOVector *qiov,
+                                      int flags)
 {
-    return raw_co_rw(bs, sector_num, nb_sectors, qiov, QEMU_AIO_READ);
+    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
 }
 
-static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                       uint64_t bytes, QEMUIOVector *qiov,
+                                       int flags)
 {
-    return raw_co_rw(bs, sector_num, nb_sectors, qiov, QEMU_AIO_WRITE);
+    assert(flags == 0);
+    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
 static void raw_aio_plug(BlockDriverState *bs)
@@ -1952,8 +1954,8 @@ BlockDriver bdrv_file = {
     .bdrv_co_get_block_status = raw_co_get_block_status,
     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
 
-    .bdrv_co_readv          = raw_co_readv,
-    .bdrv_co_writev         = raw_co_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_discard = raw_aio_discard,
     .bdrv_refresh_limits = raw_refresh_limits,
@@ -2400,8 +2402,8 @@ static BlockDriver bdrv_host_device = {
     .create_opts         = &raw_create_opts,
     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
 
-    .bdrv_co_readv          = raw_co_readv,
-    .bdrv_co_writev         = raw_co_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_aio_discard   = hdev_aio_discard,
     .bdrv_refresh_limits = raw_refresh_limits,
@@ -2530,8 +2532,9 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_create         = hdev_create,
     .create_opts         = &raw_create_opts,
 
-    .bdrv_co_readv          = raw_co_readv,
-    .bdrv_co_writev         = raw_co_writev,
+
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
@@ -2665,8 +2668,8 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_create        = hdev_create,
     .create_opts        = &raw_create_opts,
 
-    .bdrv_co_readv          = raw_co_readv,
-    .bdrv_co_writev         = raw_co_writev,
+    .bdrv_co_preadv         = raw_co_preadv,
+    .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,

From 23b0d9fb1dd39d85fc2e0cabba5cbe005126e7b5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 3 Jun 2016 16:45:44 +0200
Subject: [PATCH 18/39] block: Don't enforce 512 byte minimum alignment

If block drivers say that they can do an alignment < 512 bytes, let's
just suppose they mean it. raw-posix used to be an offender with respect
to this, but it can actually deal with byte-aligned requests now.

The default is still 512 bytes for any drivers that only implement
sector-based interfaces, but it is 1 now for drivers that implement
.bdrv_co_preadv.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c    | 2 +-
 block/io.c | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index f54bc25e8a..3d850a2999 100644
--- a/block.c
+++ b/block.c
@@ -937,7 +937,7 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
         goto fail_opts;
     }
 
-    bs->request_alignment = 512;
+    bs->request_alignment = drv->bdrv_co_preadv ? 1 : 512;
     bs->zero_beyond_eof = true;
     bs->read_only = !(bs->open_flags & BDRV_O_RDWR);
 
diff --git a/block/io.c b/block/io.c
index b261cc6327..b3ff9bec38 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1052,8 +1052,7 @@ int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
 
-    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
-    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint64_t align = bs->request_alignment;
     uint8_t *head_buf = NULL;
     uint8_t *tail_buf = NULL;
     QEMUIOVector local_qiov;
@@ -1305,7 +1304,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
     uint8_t *buf = NULL;
     QEMUIOVector local_qiov;
     struct iovec iov;
-    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint64_t align = bs->request_alignment;
     unsigned int head_padding_bytes, tail_padding_bytes;
     int ret = 0;
 
@@ -1392,8 +1391,7 @@ int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
     BdrvRequestFlags flags)
 {
     BdrvTrackedRequest req;
-    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
-    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint64_t align = bs->request_alignment;
     uint8_t *head_buf = NULL;
     uint8_t *tail_buf = NULL;
     QEMUIOVector local_qiov;

From ccb9dc10129954d0bcd7814298ed445e684d5a2a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 28 Nov 2014 15:23:12 +0100
Subject: [PATCH 19/39] linux-aio: Cancel BH if not needed

linux-aio uses a BH in order to make sure that the remaining completions
are processed even in nested event loops of completion callbacks in
order to avoid deadlocks.

There is no need, however, to have the BH overhead for the first call
into qemu_laio_completion_bh() or after all pending completions have
already been processed. Therefore, this patch calls directly into
qemu_laio_completion_bh() in qemu_laio_completion_cb() and cancels
the BH after qemu_laio_completion_bh() has processed all pending
completions.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/linux-aio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index fe7cece415..e468960146 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -149,6 +149,8 @@ static void qemu_laio_completion_bh(void *opaque)
     if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
+
+    qemu_bh_cancel(s->completion_bh);
 }
 
 static void qemu_laio_completion_cb(EventNotifier *e)
@@ -156,7 +158,7 @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);
 
     if (event_notifier_test_and_clear(&s->e)) {
-        qemu_bh_schedule(s->completion_bh);
+        qemu_laio_completion_bh(s);
     }
 }
 

From 48bea965724fafd51b395f4c0b5a0c173de46c2b Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Thu, 16 Jun 2016 09:53:53 +0200
Subject: [PATCH 20/39] doc: Fix mailing list address in
 tests/qemu-iotests/README

The address of the mailing list is qemu-devel@nongnu.org
instead of qemu-devel@savannah.nongnu.org. And while we're
at it, also mention the qemu-block mailing list here.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/README | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/README b/tests/qemu-iotests/README
index 4ccfdd1cc0..6079b401ae 100644
--- a/tests/qemu-iotests/README
+++ b/tests/qemu-iotests/README
@@ -17,4 +17,5 @@ additional options to test further image formats or I/O methods.
 * Feedback and patches
 
 Please send improvements to the test suite, general feedback or just
-reports of failing tests cases to qemu-devel@savannah.nongnu.org.
+reports of failing tests cases to qemu-devel@nongnu.org with a CC:
+to qemu-block@nongnu.org.

From f1e8474115d6be7eda14092050ffa2b031afb729 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 9 Jun 2016 16:36:00 +0200
Subject: [PATCH 21/39] block: Introduce bdrv_preadv()

We already have a byte-based bdrv_pwritev(), but the read counterpart
was still missing. This commit adds it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c            | 20 +++++++++++++-------
 include/block/block.h |  1 +
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/block/io.c b/block/io.c
index b3ff9bec38..72d72109a5 100644
--- a/block/io.c
+++ b/block/io.c
@@ -700,6 +700,18 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
     }
 }
 
+int bdrv_preadv(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+    int ret;
+
+    ret = bdrv_prwv_co(bs, offset, qiov, false, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return qiov->size;
+}
+
 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 {
     QEMUIOVector qiov;
@@ -707,19 +719,13 @@ int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
         .iov_base = (void *)buf,
         .iov_len = bytes,
     };
-    int ret;
 
     if (bytes < 0) {
         return -EINVAL;
     }
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
-    if (ret < 0) {
-        return ret;
-    }
-
-    return bytes;
+    return bdrv_preadv(bs, offset, &qiov);
 }
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
diff --git a/include/block/block.h b/include/block/block.h
index 9c3a62cc0a..9c63d07765 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -235,6 +235,7 @@ int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
                void *buf, int count);
+int bdrv_preadv(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov);
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                 const void *buf, int count);
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov);

From 5ddda0b8f0c07c8082c87d248c8eb23f43fd44a1 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 9 Jun 2016 16:50:16 +0200
Subject: [PATCH 22/39] block: Make .bdrv_load_vmstate() vectored

This brings it in line with .bdrv_save_vmstate().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c                | 25 ++++++++++++++++++++-----
 block/qcow2.c             |  6 +++---
 block/sheepdog.c          | 13 ++++++++++---
 include/block/block.h     |  1 +
 include/block/block_int.h |  4 ++--
 5 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/block/io.c b/block/io.c
index 72d72109a5..aac9b67d94 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1870,14 +1870,29 @@ int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
 
 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                       int64_t pos, int size)
+{
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base   = buf,
+        .iov_len    = size,
+    };
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+    return bdrv_readv_vmstate(bs, &qiov, pos);
+}
+
+int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
 {
     BlockDriver *drv = bs->drv;
-    if (!drv)
+
+    if (!drv) {
         return -ENOMEDIUM;
-    if (drv->bdrv_load_vmstate)
-        return drv->bdrv_load_vmstate(bs, buf, pos, size);
-    if (bs->file)
-        return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
+    } else if (drv->bdrv_load_vmstate) {
+        return drv->bdrv_load_vmstate(bs, qiov, pos);
+    } else if (bs->file) {
+        return bdrv_readv_vmstate(bs->file->bs, qiov, pos);
+    }
+
     return -ENOTSUP;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 9de716a524..362ada24f9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2926,8 +2926,8 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
     return ret;
 }
 
-static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
-                              int64_t pos, int size)
+static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                              int64_t pos)
 {
     BDRVQcow2State *s = bs->opaque;
     bool zero_beyond_eof = bs->zero_beyond_eof;
@@ -2935,7 +2935,7 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 
     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
     bs->zero_beyond_eof = false;
-    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+    ret = bdrv_preadv(bs, qcow2_vm_state_offset(s) + pos, qiov);
     bs->zero_beyond_eof = zero_beyond_eof;
 
     return ret;
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 23fbace1f9..ef5d044ab9 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2784,12 +2784,19 @@ static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
     return ret;
 }
 
-static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
-                           int64_t pos, int size)
+static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                           int64_t pos)
 {
     BDRVSheepdogState *s = bs->opaque;
+    void *buf;
+    int ret;
 
-    return do_load_save_vmstate(s, data, pos, size, 1);
+    buf = qemu_blockalign(bs, qiov->size);
+    ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
+    qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
+    qemu_vfree(buf);
+
+    return ret;
 }
 
 
diff --git a/include/block/block.h b/include/block/block.h
index 9c63d07765..733a8ec2ec 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -431,6 +431,7 @@ void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
 
+int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                       int64_t pos, int size);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8a4963c4fe..f9a32cc5c5 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -226,8 +226,8 @@ struct BlockDriver {
 
     int (*bdrv_save_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos);
-    int (*bdrv_load_vmstate)(BlockDriverState *bs, uint8_t *buf,
-                             int64_t pos, int size);
+    int (*bdrv_load_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
+                             int64_t pos);
 
     int (*bdrv_change_backing_file)(BlockDriverState *bs,
         const char *backing_file, const char *backing_fmt);

From b433d9424d271ce68f592590f77dc2b30944d3a5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 10 Jun 2016 17:57:26 +0200
Subject: [PATCH 23/39] block: Allow .bdrv_load/save_vmstate() to return
 0/-errno

The return value of .bdrv_load/save_vmstate() can be any non-negative
number in case of success now. It used to be bytes/-errno.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index aac9b67d94..af4d43ebef 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1848,9 +1848,16 @@ int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
         .iov_base   = (void *) buf,
         .iov_len    = size,
     };
+    int ret;
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    return bdrv_writev_vmstate(bs, &qiov, pos);
+
+    ret = bdrv_writev_vmstate(bs, &qiov, pos);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return size;
 }
 
 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
@@ -1876,9 +1883,15 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
         .iov_base   = buf,
         .iov_len    = size,
     };
+    int ret;
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    return bdrv_readv_vmstate(bs, &qiov, pos);
+    ret = bdrv_readv_vmstate(bs, &qiov, pos);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return size;
 }
 
 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)

From 1a8ae8221799901dc399a174b52a970d8e6f976a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 9 Jun 2016 16:24:44 +0200
Subject: [PATCH 24/39] block: Make bdrv_load/save_vmstate coroutine_fns

This allows drivers to share code between normal I/O and vmstate
accesses.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c                | 80 ++++++++++++++++++++++++++++-----------
 include/block/block_int.h | 10 +++--
 2 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/block/io.c b/block/io.c
index af4d43ebef..e1e8948be9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1840,6 +1840,62 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
 }
 
+typedef struct BdrvVmstateCo {
+    BlockDriverState    *bs;
+    QEMUIOVector        *qiov;
+    int64_t             pos;
+    bool                is_read;
+    int                 ret;
+} BdrvVmstateCo;
+
+static int coroutine_fn
+bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
+                   bool is_read)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    } else if (drv->bdrv_load_vmstate) {
+        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
+                       : drv->bdrv_save_vmstate(bs, qiov, pos);
+    } else if (bs->file) {
+        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+    }
+
+    return -ENOTSUP;
+}
+
+static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
+{
+    BdrvVmstateCo *co = opaque;
+    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
+}
+
+static inline int
+bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
+                bool is_read)
+{
+    if (qemu_in_coroutine()) {
+        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
+    } else {
+        BdrvVmstateCo data = {
+            .bs         = bs,
+            .qiov       = qiov,
+            .pos        = pos,
+            .is_read    = is_read,
+            .ret        = -EINPROGRESS,
+        };
+        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry);
+
+        qemu_coroutine_enter(co, &data);
+        while (data.ret == -EINPROGRESS) {
+            aio_poll(bdrv_get_aio_context(bs), true);
+        }
+        return data.ret;
+    }
+}
+
 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                       int64_t pos, int size)
 {
@@ -1862,17 +1918,7 @@ int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
 
 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
 {
-    BlockDriver *drv = bs->drv;
-
-    if (!drv) {
-        return -ENOMEDIUM;
-    } else if (drv->bdrv_save_vmstate) {
-        return drv->bdrv_save_vmstate(bs, qiov, pos);
-    } else if (bs->file) {
-        return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
-    }
-
-    return -ENOTSUP;
+    return bdrv_rw_vmstate(bs, qiov, pos, false);
 }
 
 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
@@ -1896,17 +1942,7 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 
 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
 {
-    BlockDriver *drv = bs->drv;
-
-    if (!drv) {
-        return -ENOMEDIUM;
-    } else if (drv->bdrv_load_vmstate) {
-        return drv->bdrv_load_vmstate(bs, qiov, pos);
-    } else if (bs->file) {
-        return bdrv_readv_vmstate(bs->file->bs, qiov, pos);
-    }
-
-    return -ENOTSUP;
+    return bdrv_rw_vmstate(bs, qiov, pos, true);
 }
 
 /**************************************************************/
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f9a32cc5c5..1fe0811a65 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -224,10 +224,12 @@ struct BlockDriver {
     int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
     ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);
 
-    int (*bdrv_save_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
-                             int64_t pos);
-    int (*bdrv_load_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
-                             int64_t pos);
+    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
+                                          QEMUIOVector *qiov,
+                                          int64_t pos);
+    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
+                                          QEMUIOVector *qiov,
+                                          int64_t pos);
 
     int (*bdrv_change_backing_file)(BlockDriverState *bs,
         const char *backing_file, const char *backing_fmt);

From 734a77584ae13d36113a7a7cd8b54beb49a8a48e Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Jun 2016 17:07:24 +0200
Subject: [PATCH 25/39] qcow2: Let vmstate call qcow2_co_preadv/pwrite directly

We don't really want to go through the block layer in order to read from
or write to the vmstate in a qcow2 image. Doing so required a few ugly
hacks like saving and restoring the old image size (because writing to
vmstate offsets would increase the image size) or disabling the "reads
after EOF = zeroes" logic. When calling the right functions directly,
these hacks aren't necessary any more.

Note that .bdrv_vmstate_load/save() return 0 instead of the number of
bytes in case of success now.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2.c | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 362ada24f9..4718f8250e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2909,36 +2909,20 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                               int64_t pos)
 {
     BDRVQcow2State *s = bs->opaque;
-    int64_t total_sectors = bs->total_sectors;
-    bool zero_beyond_eof = bs->zero_beyond_eof;
-    int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
-    bs->zero_beyond_eof = false;
-    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
-    bs->zero_beyond_eof = zero_beyond_eof;
-
-    /* bdrv_co_do_writev will have increased the total_sectors value to include
-     * the VM state - the VM state is however not an actual part of the block
-     * device, therefore, we need to restore the old value. */
-    bs->total_sectors = total_sectors;
-
-    return ret;
+    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
+                                    qiov->size, qiov, 0);
 }
 
 static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                               int64_t pos)
 {
     BDRVQcow2State *s = bs->opaque;
-    bool zero_beyond_eof = bs->zero_beyond_eof;
-    int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
-    bs->zero_beyond_eof = false;
-    ret = bdrv_preadv(bs, qcow2_vm_state_offset(s) + pos, qiov);
-    bs->zero_beyond_eof = zero_beyond_eof;
-
-    return ret;
+    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
+                                   qiov->size, qiov, 0);
 }
 
 /*

From c9d20029f43a08c6362a655c2c5272612186a004 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Jun 2016 17:13:47 +0200
Subject: [PATCH 26/39] block: Remove bs->zero_beyond_eof

It is always true for open images now.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c                   |  2 --
 block/io.c                | 56 +++++++++++++++++----------------------
 include/block/block_int.h |  3 ---
 3 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/block.c b/block.c
index 3d850a2999..b350794c3e 100644
--- a/block.c
+++ b/block.c
@@ -938,7 +938,6 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
     }
 
     bs->request_alignment = drv->bdrv_co_preadv ? 1 : 512;
-    bs->zero_beyond_eof = true;
     bs->read_only = !(bs->open_flags & BDRV_O_RDWR);
 
     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
@@ -2192,7 +2191,6 @@ static void bdrv_close(BlockDriverState *bs)
         bs->encrypted = 0;
         bs->valid_key = 0;
         bs->sg = 0;
-        bs->zero_beyond_eof = false;
         QDECREF(bs->options);
         QDECREF(bs->explicit_options);
         bs->options = NULL;
diff --git a/block/io.c b/block/io.c
index e1e8948be9..f5ce5f171e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -967,6 +967,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
     int64_t align, QEMUIOVector *qiov, int flags)
 {
+    int64_t total_bytes, max_bytes;
     int ret;
 
     assert(is_power_of_2(align));
@@ -1008,40 +1009,33 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
     /* Forward the request to the BlockDriver */
-    if (!bs->zero_beyond_eof) {
+    total_bytes = bdrv_getlength(bs);
+    if (total_bytes < 0) {
+        ret = total_bytes;
+        goto out;
+    }
+
+    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
+    if (bytes < max_bytes) {
         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
+    } else if (max_bytes > 0) {
+        QEMUIOVector local_qiov;
+
+        qemu_iovec_init(&local_qiov, qiov->niov);
+        qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
+
+        ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
+
+        qemu_iovec_destroy(&local_qiov);
     } else {
-        /* Read zeros after EOF */
-        int64_t total_bytes, max_bytes;
+        ret = 0;
+    }
 
-        total_bytes = bdrv_getlength(bs);
-        if (total_bytes < 0) {
-            ret = total_bytes;
-            goto out;
-        }
-
-        max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
-        if (bytes < max_bytes) {
-            ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
-        } else if (max_bytes > 0) {
-            QEMUIOVector local_qiov;
-
-            qemu_iovec_init(&local_qiov, qiov->niov);
-            qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
-
-            ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
-
-            qemu_iovec_destroy(&local_qiov);
-        } else {
-            ret = 0;
-        }
-
-        /* Reading beyond end of file is supposed to produce zeroes */
-        if (ret == 0 && total_bytes < offset + bytes) {
-            uint64_t zero_offset = MAX(0, total_bytes - offset);
-            uint64_t zero_bytes = offset + bytes - zero_offset;
-            qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
-        }
+    /* Reading beyond end of file is supposed to produce zeroes */
+    if (ret == 0 && total_bytes < offset + bytes) {
+        uint64_t zero_offset = MAX(0, total_bytes - offset);
+        uint64_t zero_bytes = offset + bytes - zero_offset;
+        qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
     }
 
 out:
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1fe0811a65..16c43e224e 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -451,9 +451,6 @@ struct BlockDriverState {
     /* I/O Limits */
     BlockLimits bl;
 
-    /* Whether produces zeros when read beyond eof */
-    bool zero_beyond_eof;
-
     /* Alignment requirement for offset/length of I/O requests */
     unsigned int request_alignment;
     /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */

From 418690447a5340050e64efc3c811fd30d9e96e5b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 16 Jun 2016 12:59:30 +0200
Subject: [PATCH 27/39] block: Fix snapshot=on with aio=native

snapshot=on creates a temporary overlay that is always opened with
cache=unsafe (the cache mode specified by the user is only for the
actual image file and its children). This means that we must not inherit
the BDRV_O_NATIVE_AIO flag for the temporary overlay because trying to
use Linux AIO with cache=unsafe results in an error.

Reproducer without this patch:

$ x86_64-softmmu/qemu-system-x86_64 -drive file=/tmp/test.qcow2,cache=none,aio=native,snapshot=on
qemu-system-x86_64: -drive file=/tmp/test.qcow2,cache=none,aio=native,snapshot=on: aio=native was
specified, but it requires cache.direct=on, which was not specified.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block.c b/block.c
index b350794c3e..8104225cd3 100644
--- a/block.c
+++ b/block.c
@@ -684,6 +684,10 @@ static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
     /* For temporary files, unconditional cache=unsafe is fine */
     qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
     qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
+
+    /* aio=native doesn't work for cache.direct=off, so disable it for the
+     * temporary snapshot */
+    *child_flags &= ~BDRV_O_NATIVE_AIO;
 }
 
 /*

From eb1364ceac3d62f69701d00383b3052a9bdb0df7 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Fri, 27 May 2016 12:53:36 +0200
Subject: [PATCH 28/39] block: use the block job list in bdrv_drain_all()

bdrv_drain_all() pauses all block jobs by using bdrv_next() to iterate
over all top-level BlockDriverStates. Therefore the code is unable to
find block jobs in other nodes.

This patch uses block_job_next() to iterate over all block jobs.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: 55ee7d7d4a65c28aa1a1b28823897ef326f328e2.1464346103.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/io.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index f5ce5f171e..ebdb9d834c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -289,15 +289,21 @@ void bdrv_drain_all(void)
     bool busy = true;
     BlockDriverState *bs;
     BdrvNextIterator it;
+    BlockJob *job = NULL;
     GSList *aio_ctxs = NULL, *ctx;
 
+    while ((job = block_job_next(job))) {
+        AioContext *aio_context = blk_get_aio_context(job->blk);
+
+        aio_context_acquire(aio_context);
+        block_job_pause(job);
+        aio_context_release(aio_context);
+    }
+
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
-        if (bs->job) {
-            block_job_pause(bs->job);
-        }
         bdrv_parent_drained_begin(bs);
         bdrv_io_unplugged_begin(bs);
         bdrv_drain_recurse(bs);
@@ -340,12 +346,18 @@ void bdrv_drain_all(void)
         aio_context_acquire(aio_context);
         bdrv_io_unplugged_end(bs);
         bdrv_parent_drained_end(bs);
-        if (bs->job) {
-            block_job_resume(bs->job);
-        }
         aio_context_release(aio_context);
     }
     g_slist_free(aio_ctxs);
+
+    job = NULL;
+    while ((job = block_job_next(job))) {
+        AioContext *aio_context = blk_get_aio_context(job->blk);
+
+        aio_context_acquire(aio_context);
+        block_job_resume(job);
+        aio_context_release(aio_context);
+    }
 }
 
 /**

From f0f55deda2ac3c742d1dc67526d8834a50870285 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Fri, 27 May 2016 12:53:37 +0200
Subject: [PATCH 29/39] block: use the block job list in qmp_query_block_jobs()

qmp_query_block_jobs() uses bdrv_next() to look for block jobs, but
this function can only find those in top-level BlockDriverStates.

This patch uses block_job_next() instead.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: a8b7e5497b7c1fa67c12fcceae1630d01c3b1f96.1464346103.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockdev.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 11177b4fcc..1d498c7b19 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -4157,22 +4157,18 @@ void qmp_x_blockdev_change(const char *parent, bool has_child,
 BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
     BlockJobInfoList *head = NULL, **p_next = &head;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockJob *job;
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+        BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
+        AioContext *aio_context = blk_get_aio_context(job->blk);
 
         aio_context_acquire(aio_context);
-
-        if (bs->job) {
-            BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
-            elem->value = block_job_query(bs->job);
-            *p_next = elem;
-            p_next = &elem->next;
-        }
-
+        elem->value = block_job_query(job);
         aio_context_release(aio_context);
+
+        *p_next = elem;
+        p_next = &elem->next;
     }
 
     return head;

From 0824afda0cd20045ffe87d58e142774514b61026 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Fri, 27 May 2016 12:53:38 +0200
Subject: [PATCH 30/39] block: Prevent sleeping jobs from resuming if they have
 been paused

If we pause a block job and drain its BlockDriverState we want that
the job remains inactive until we call block_job_resume() again.

However if we pause the job while it is sleeping then it will resume
when the sleep timer fires.

This patch prevents that from happening by checking if the job has
been paused after it comes back from sleeping.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 3d9011151512326b890d22bdab3530244ef349d7.1464346103.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockjob.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index c095cc57cb..01b896b7e7 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -361,10 +361,12 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
     }
 
     job->busy = false;
+    if (!block_job_is_paused(job)) {
+        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
+    }
+    /* The job can be paused while sleeping, so check this again */
     if (block_job_is_paused(job)) {
         qemu_coroutine_yield();
-    } else {
-        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
     }
     job->busy = true;
 }

From 834fe28ddffaec469cf048c7a48eb610e7a6e906 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Fri, 27 May 2016 12:53:39 +0200
Subject: [PATCH 31/39] block: Create the commit block job before reopening any
 image

If the base or overlay images need to be reopened in read-write mode
but the block_job_create() call fails then no one will put those
images back in read-only mode.

We can solve this problem easily by calling block_job_create() first.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Message-id: aa495045770a6f1a7cc5d408397a17c75097fdd8.1464346103.git.berto@igalia.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/commit.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index 8a00e1146c..444333ba65 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -236,6 +236,11 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
         return;
     }
 
+    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
     orig_base_flags    = bdrv_get_flags(base);
     orig_overlay_flags = bdrv_get_flags(overlay_bs);
 
@@ -252,16 +257,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
         bdrv_reopen_multiple(reopen_queue, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
+            block_job_unref(&s->common);
             return;
         }
     }
 
 
-    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp);
-    if (!s) {
-        return;
-    }
-
     s->base = blk_new();
     blk_insert_bs(s->base, base);
 

From 6ea66b590cda71bddc3112b40a5c13378c1a3445 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 3 Jun 2016 17:07:52 +0800
Subject: [PATCH 32/39] iotests: 095: Clean up QEMU before showing image info

Signed-off-by: Fam Zheng <famz@redhat.com>
Message-id: 1464944872-24484-1-git-send-email-famz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/095 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/095 b/tests/qemu-iotests/095
index dad04b9ac9..030adb22e1 100755
--- a/tests/qemu-iotests/095
+++ b/tests/qemu-iotests/095
@@ -74,6 +74,8 @@ _send_qemu_cmd $h "{ 'execute': 'block-commit',
                                  'arguments': { 'device': 'test',
                                  'top': '"${TEST_IMG}.snp1"' } }" "BLOCK_JOB_COMPLETED"
 
+_cleanup_qemu
+
 echo
 echo "=== Base image info after commit and resize ==="
 TEST_IMG="${TEST_IMG}.base" _img_info | _filter_img_info

From 87cd3d20e1e9b31315bf416d6e720cf7e3eb2ea9 Mon Sep 17 00:00:00 2001
From: Vikhyat Umrao <vumrao@redhat.com>
Date: Mon, 9 May 2016 13:21:59 +0530
Subject: [PATCH 33/39] rbd:change error_setg() to error_setg_errno()

Ceph RBD block driver does not use error_setg_errno() where
it is possible to use. This patch replaces error_setg()
from error_setg_errno().

Signed-off-by: Vikhyat Umrao <vumrao@redhat.com>
Message-id: 1462780319-5796-1-git-send-email-vumrao@redhat.com
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/rbd.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 5bc5b32530..5226b6fef8 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -290,7 +290,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
             if (only_read_conf_file) {
                 ret = rados_conf_read_file(cluster, value);
                 if (ret < 0) {
-                    error_setg(errp, "error reading conf file %s", value);
+                    error_setg_errno(errp, -ret, "error reading conf file %s",
+                                     value);
                     break;
                 }
             }
@@ -299,7 +300,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
         } else if (!only_read_conf_file) {
             ret = rados_conf_set(cluster, name, value);
             if (ret < 0) {
-                error_setg(errp, "invalid conf option %s", name);
+                error_setg_errno(errp, -ret, "invalid conf option %s", name);
                 ret = -EINVAL;
                 break;
             }
@@ -354,9 +355,10 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
     }
 
     clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
-    if (rados_create(&cluster, clientname) < 0) {
-        error_setg(errp, "error initializing");
-        return -EIO;
+    ret = rados_create(&cluster, clientname);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error initializing");
+        return ret;
     }
 
     if (strstr(conf, "conf=") == NULL) {
@@ -381,21 +383,27 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
         return -EIO;
     }
 
-    if (rados_connect(cluster) < 0) {
-        error_setg(errp, "error connecting");
+    ret = rados_connect(cluster);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error connecting");
         rados_shutdown(cluster);
-        return -EIO;
+        return ret;
     }
 
-    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
-        error_setg(errp, "error opening pool %s", pool);
+    ret = rados_ioctx_create(cluster, pool, &io_ctx);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error opening pool %s", pool);
         rados_shutdown(cluster);
-        return -EIO;
+        return ret;
     }
 
     ret = rbd_create(io_ctx, name, bytes, &obj_order);
     rados_ioctx_destroy(io_ctx);
     rados_shutdown(cluster);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error rbd create");
+        return ret;
+    }
 
     return ret;
 }
@@ -500,7 +508,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
     clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
     r = rados_create(&s->cluster, clientname);
     if (r < 0) {
-        error_setg(errp, "error initializing");
+        error_setg_errno(errp, -r, "error initializing");
         goto failed_opts;
     }
 
@@ -546,19 +554,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
 
     r = rados_connect(s->cluster);
     if (r < 0) {
-        error_setg(errp, "error connecting");
+        error_setg_errno(errp, -r, "error connecting");
         goto failed_shutdown;
     }
 
     r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);
     if (r < 0) {
-        error_setg(errp, "error opening pool %s", pool);
+        error_setg_errno(errp, -r, "error opening pool %s", pool);
         goto failed_shutdown;
     }
 
     r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
     if (r < 0) {
-        error_setg(errp, "error reading header from %s", s->name);
+        error_setg_errno(errp, -r, "error reading header from %s", s->name);
         goto failed_open;
     }
 

From 9bd910e2cbe413ab5927068bf189e929cb6790bc Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 10 Jun 2016 20:57:46 +0200
Subject: [PATCH 34/39] block: Allow replacement of a BDS by its overlay

change_parent_backing_link() asserts that the BDS to be replaced is not
used as a backing file. However, we may want to replace a BDS by its
overlay in which case that very link should not be redirected.

For instance, when doing a sync=none drive-mirror operation, we may have
the following BDS/BB forest before block job completion:

  target

  base <- source <- BlockBackend

During job completion, we want to establish the source BDS as the
target's backing node:

          target
            |
            v
  base <- source <- BlockBackend

This makes the target a valid replacement for the source:

          target <- BlockBackend
            |
            v
  base <- source

Without this modification to change_parent_backing_link() we have to
inject the target into the graph before the source is its backing node,
thus temporarily creating a wrong graph:

  target <- BlockBackend

  base <- source

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20160610185750.30956-2-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 8104225cd3..d090324c68 100644
--- a/block.c
+++ b/block.c
@@ -2226,9 +2226,23 @@ void bdrv_close_all(void)
 static void change_parent_backing_link(BlockDriverState *from,
                                        BlockDriverState *to)
 {
-    BdrvChild *c, *next;
+    BdrvChild *c, *next, *to_c;
 
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
+        if (c->role == &child_backing) {
+            /* @from is generally not allowed to be a backing file, except for
+             * when @to is the overlay. In that case, @from may not be replaced
+             * by @to as @to's backing node. */
+            QLIST_FOREACH(to_c, &to->children, next) {
+                if (to_c == c) {
+                    break;
+                }
+            }
+            if (to_c) {
+                continue;
+            }
+        }
+
         assert(c->role != &child_backing);
         bdrv_ref(to);
         bdrv_replace_child(c, to);

From 274fccee2bf63702b34e3923b1e50a49147a7918 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 10 Jun 2016 20:57:47 +0200
Subject: [PATCH 35/39] block/mirror: Fix target backing BDS

Currently, we are trying to move the backing BDS from the source to the
target in bdrv_replace_in_backing_chain() which is called from
mirror_exit(). However, mirror_complete() already tries to open the
target's backing chain with a call to bdrv_open_backing_file().

First, we should only set the target's backing BDS once. Second, the
mirroring block job has a better idea of what to set it to than the
generic code in bdrv_replace_in_backing_chain() (in fact, the latter's
conditions on when to move the backing BDS from source to target are not
really correct).

Therefore, remove that code from bdrv_replace_in_backing_chain() and
leave it to mirror_complete().

Depending on what kind of mirroring is performed, we furthermore want to
use different strategies to open the target's backing chain:

- If blockdev-mirror is used, we can assume the user made sure that the
  target already has the correct backing chain. In particular, we should
  not try to open a backing file if the target does not have any yet.

- If drive-mirror with mode=absolute-paths is used, we can and should
  reuse the already existing chain of nodes that the source BDS is in.
  In case of sync=full, no backing BDS is required; with sync=top, we
  just link the source's backing BDS to the target, and with sync=none,
  we use the source BDS as the target's backing BDS.
  We should not try to open these backing files anew because this would
  lead to two BDSs existing per physical file in the backing chain, and
  we would like to avoid such concurrent access.

- If drive-mirror with mode=existing is used, we have to use the
  information provided in the physical image file which means opening
  the target's backing chain completely anew, just as it has been done
  already.
  If the target's backing chain shares images with the source, this may
  lead to multiple BDSs per physical image file. But since we cannot
  reliably ascertain this case, there is nothing we can do about it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20160610185750.30956-3-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c                   |  8 --------
 block/mirror.c            | 39 ++++++++++++++++++++++++++++-----------
 blockdev.c                | 15 ++++++++++++---
 include/block/block_int.h | 18 +++++++++++++++++-
 4 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/block.c b/block.c
index d090324c68..b331eb9d38 100644
--- a/block.c
+++ b/block.c
@@ -2291,14 +2291,6 @@ void bdrv_replace_in_backing_chain(BlockDriverState *old, BlockDriverState *new)
 
     change_parent_backing_link(old, new);
 
-    /* Change backing files if a previously independent node is added to the
-     * chain. For active commit, we replace top by its own (indirect) backing
-     * file and don't do anything here so we don't build a loop. */
-    if (new->backing == NULL && !bdrv_chain_contains(backing_bs(old), new)) {
-        bdrv_set_backing_hd(new, backing_bs(old));
-        bdrv_set_backing_hd(old, NULL);
-    }
-
     bdrv_unref(old);
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index 41848b2c8e..075384a9cf 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -44,6 +44,7 @@ typedef struct MirrorBlockJob {
     /* Used to block operations on the drive-mirror-replace target */
     Error *replace_blocker;
     bool is_none_mode;
+    BlockMirrorBackingMode backing_mode;
     BlockdevOnError on_source_error, on_target_error;
     bool synced;
     bool should_complete;
@@ -742,20 +743,26 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
 static void mirror_complete(BlockJob *job, Error **errp)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-    Error *local_err = NULL;
-    int ret;
+    BlockDriverState *src, *target;
+
+    src = blk_bs(job->blk);
+    target = blk_bs(s->target);
 
-    ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing",
-                                 &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        return;
-    }
     if (!s->synced) {
         error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
         return;
     }
 
+    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
+        int ret;
+
+        assert(!target->backing);
+        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
+        if (ret < 0) {
+            return;
+        }
+    }
+
     /* check the target bs is not blocked and block all operations on it */
     if (s->replaces) {
         AioContext *replace_aio_context;
@@ -777,6 +784,13 @@ static void mirror_complete(BlockJob *job, Error **errp)
         aio_context_release(replace_aio_context);
     }
 
+    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
+        BlockDriverState *backing = s->is_none_mode ? src : s->base;
+        if (backing_bs(target) != backing) {
+            bdrv_set_backing_hd(target, backing);
+        }
+    }
+
     s->should_complete = true;
     block_job_enter(&s->common);
 }
@@ -799,6 +813,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                              const char *replaces,
                              int64_t speed, uint32_t granularity,
                              int64_t buf_size,
+                             BlockMirrorBackingMode backing_mode,
                              BlockdevOnError on_source_error,
                              BlockdevOnError on_target_error,
                              bool unmap,
@@ -836,6 +851,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
     s->on_source_error = on_source_error;
     s->on_target_error = on_target_error;
     s->is_none_mode = is_none_mode;
+    s->backing_mode = backing_mode;
     s->base = base;
     s->granularity = granularity;
     s->buf_size = ROUND_UP(buf_size, granularity);
@@ -859,7 +875,8 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                   const char *replaces,
                   int64_t speed, uint32_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockdevOnError on_source_error,
+                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+                  BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   bool unmap,
                   BlockCompletionFunc *cb,
@@ -875,7 +892,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
     base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
     mirror_start_job(bs, target, replaces,
-                     speed, granularity, buf_size,
+                     speed, granularity, buf_size, backing_mode,
                      on_source_error, on_target_error, unmap, cb, opaque, errp,
                      &mirror_job_driver, is_none_mode, base);
 }
@@ -922,7 +939,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
         }
     }
 
-    mirror_start_job(bs, base, NULL, speed, 0, 0,
+    mirror_start_job(bs, base, NULL, speed, 0, 0, MIRROR_LEAVE_BACKING_CHAIN,
                      on_error, on_error, false, cb, opaque, &local_err,
                      &commit_active_job_driver, false, base);
     if (local_err) {
diff --git a/blockdev.c b/blockdev.c
index 1d498c7b19..c9a0068cd6 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3426,6 +3426,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
                                    BlockDriverState *target,
                                    bool has_replaces, const char *replaces,
                                    enum MirrorSyncMode sync,
+                                   BlockMirrorBackingMode backing_mode,
                                    bool has_speed, int64_t speed,
                                    bool has_granularity, uint32_t granularity,
                                    bool has_buf_size, int64_t buf_size,
@@ -3483,7 +3484,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
      */
     mirror_start(bs, target,
                  has_replaces ? replaces : NULL,
-                 speed, granularity, buf_size, sync,
+                 speed, granularity, buf_size, sync, backing_mode,
                  on_source_error, on_target_error, unmap,
                  block_job_cb, bs, errp);
 }
@@ -3506,6 +3507,7 @@ void qmp_drive_mirror(const char *device, const char *target,
     BlockBackend *blk;
     BlockDriverState *source, *target_bs;
     AioContext *aio_context;
+    BlockMirrorBackingMode backing_mode;
     Error *local_err = NULL;
     QDict *options = NULL;
     int flags;
@@ -3579,6 +3581,12 @@ void qmp_drive_mirror(const char *device, const char *target,
         }
     }
 
+    if (mode == NEW_IMAGE_MODE_ABSOLUTE_PATHS) {
+        backing_mode = MIRROR_SOURCE_BACKING_CHAIN;
+    } else {
+        backing_mode = MIRROR_OPEN_BACKING_CHAIN;
+    }
+
     if ((sync == MIRROR_SYNC_MODE_FULL || !source)
         && mode != NEW_IMAGE_MODE_EXISTING)
     {
@@ -3627,7 +3635,7 @@ void qmp_drive_mirror(const char *device, const char *target,
     bdrv_set_aio_context(target_bs, aio_context);
 
     blockdev_mirror_common(bs, target_bs,
-                           has_replaces, replaces, sync,
+                           has_replaces, replaces, sync, backing_mode,
                            has_speed, speed,
                            has_granularity, granularity,
                            has_buf_size, buf_size,
@@ -3659,6 +3667,7 @@ void qmp_blockdev_mirror(const char *device, const char *target,
     BlockBackend *blk;
     BlockDriverState *target_bs;
     AioContext *aio_context;
+    BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN;
     Error *local_err = NULL;
 
     blk = blk_by_name(device);
@@ -3684,7 +3693,7 @@ void qmp_blockdev_mirror(const char *device, const char *target,
     bdrv_set_aio_context(target_bs, aio_context);
 
     blockdev_mirror_common(bs, target_bs,
-                           has_replaces, replaces, sync,
+                           has_replaces, replaces, sync, backing_mode,
                            has_speed, speed,
                            has_granularity, granularity,
                            has_buf_size, buf_size,
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 16c43e224e..688c6be009 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -509,6 +509,20 @@ struct BlockBackendRootState {
     BlockdevDetectZeroesOptions detect_zeroes;
 };
 
+typedef enum BlockMirrorBackingMode {
+    /* Reuse the existing backing chain from the source for the target.
+     * - sync=full: Set backing BDS to NULL.
+     * - sync=top:  Use source's backing BDS.
+     * - sync=none: Use source as the backing BDS. */
+    MIRROR_SOURCE_BACKING_CHAIN,
+
+    /* Open the target's backing chain completely anew */
+    MIRROR_OPEN_BACKING_CHAIN,
+
+    /* Do not change the target's backing BDS after job completion */
+    MIRROR_LEAVE_BACKING_CHAIN,
+} BlockMirrorBackingMode;
+
 static inline BlockDriverState *backing_bs(BlockDriverState *bs)
 {
     return bs->backing ? bs->backing->bs : NULL;
@@ -671,6 +685,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
  * @granularity: The chosen granularity for the dirty bitmap.
  * @buf_size: The amount of data that can be in flight at one time.
  * @mode: Whether to collapse all images in the chain to the target.
+ * @backing_mode: How to establish the target's backing chain after completion.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
  * @unmap: Whether to unmap target where source sectors only contain zeroes.
@@ -686,7 +701,8 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                   const char *replaces,
                   int64_t speed, uint32_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockdevOnError on_source_error,
+                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+                  BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   bool unmap,
                   BlockCompletionFunc *cb,

From 67882b153544f7b29412a5bdf40dd3bbba8d64ae Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 10 Jun 2016 20:57:48 +0200
Subject: [PATCH 36/39] block/null: Implement bdrv_refresh_filename()

The null block driver ignores any filename used for creating its BDSs,
which allows creating such BDSs even without any filename at all. In
that case, we currently construct a JSON filename when queried instead
of a plain "null-co://" or "null-aio://". This patch implements
bdrv_refresh_filename() to remedy this behavior.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20160610185750.30956-4-mreitz@redhat.com
[mreitz@redhat.com: Added commit message]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/null.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/block/null.c b/block/null.c
index 396500babd..b511010ba5 100644
--- a/block/null.c
+++ b/block/null.c
@@ -12,6 +12,8 @@
 
 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
 #include "block/block_int.h"
 
 #define NULL_OPT_LATENCY "latency-ns"
@@ -223,6 +225,20 @@ static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs,
     }
 }
 
+static void null_refresh_filename(BlockDriverState *bs, QDict *opts)
+{
+    QINCREF(opts);
+    qdict_del(opts, "filename");
+
+    if (!qdict_size(opts)) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://",
+                 bs->drv->format_name);
+    }
+
+    qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name));
+    bs->full_open_options = opts;
+}
+
 static BlockDriver bdrv_null_co = {
     .format_name            = "null-co",
     .protocol_name          = "null-co",
@@ -238,6 +254,8 @@ static BlockDriver bdrv_null_co = {
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
     .bdrv_co_get_block_status   = null_co_get_block_status,
+
+    .bdrv_refresh_filename  = null_refresh_filename,
 };
 
 static BlockDriver bdrv_null_aio = {
@@ -255,6 +273,8 @@ static BlockDriver bdrv_null_aio = {
     .bdrv_reopen_prepare    = null_reopen_prepare,
 
     .bdrv_co_get_block_status   = null_co_get_block_status,
+
+    .bdrv_refresh_filename  = null_refresh_filename,
 };
 
 static void bdrv_null_init(void)

From 298c6009dc79dc0cc71cab4a1ad9f8231c1ae858 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 10 Jun 2016 20:57:49 +0200
Subject: [PATCH 37/39] iotests: Add test for post-mirror backing chains

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20160610185750.30956-5-mreitz@redhat.com
Reviewed-by: Fam Zheng <famz@redhat.com>
[mreitz@redhat.com: Removed unnecessary imports]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/155     | 261 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/155.out |   5 +
 tests/qemu-iotests/group   |   1 +
 3 files changed, 267 insertions(+)
 create mode 100755 tests/qemu-iotests/155
 create mode 100644 tests/qemu-iotests/155.out

diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155
new file mode 100755
index 0000000000..4057b5e2aa
--- /dev/null
+++ b/tests/qemu-iotests/155
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+#
+# Test whether the backing BDSs are correct after completion of a
+# mirror block job; in "existing" modes (drive-mirror with
+# mode=existing and blockdev-mirror) the backing chain should not be
+# overridden.
+#
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+from iotests import qemu_img
+
+back0_img = os.path.join(iotests.test_dir, 'back0.' + iotests.imgfmt)
+back1_img = os.path.join(iotests.test_dir, 'back1.' + iotests.imgfmt)
+back2_img = os.path.join(iotests.test_dir, 'back2.' + iotests.imgfmt)
+source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt)
+target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt)
+
+
+# Class variables for controlling its behavior:
+#
+# existing: If True, explicitly create the target image and blockdev-add it
+# target_backing: If existing is True: Use this filename as the backing file
+#                 of the target image
+#                 (None: no backing file)
+# target_blockdev_backing: If existing is True: Pass this dict as "backing"
+#                          for the blockdev-add command
+#                          (None: do not pass "backing")
+# target_real_backing: If existing is True: The real filename of the backing
+#                      image during runtime, only makes sense if
+#                      target_blockdev_backing is not None
+#                      (None: same as target_backing)
+
+class BaseClass(iotests.QMPTestCase):
+    target_blockdev_backing = None
+    target_real_backing = None
+
+    def setUp(self):
+        qemu_img('create', '-f', iotests.imgfmt, back0_img, '1M')
+        qemu_img('create', '-f', iotests.imgfmt, '-b', back0_img, back1_img)
+        qemu_img('create', '-f', iotests.imgfmt, '-b', back1_img, back2_img)
+        qemu_img('create', '-f', iotests.imgfmt, '-b', back2_img, source_img)
+
+        self.vm = iotests.VM()
+        self.vm.add_drive(None, '', 'none')
+        self.vm.launch()
+
+        # Add the BDS via blockdev-add so it stays around after the mirror block
+        # job has been completed
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'source',
+                                      'driver': iotests.imgfmt,
+                                      'file': {'driver': 'file',
+                                               'filename': source_img}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('x-blockdev-insert-medium',
+                             device='drive0', node_name='source')
+        self.assert_qmp(result, 'return', {})
+
+        self.assertIntactSourceBackingChain()
+
+        if self.existing:
+            if self.target_backing:
+                qemu_img('create', '-f', iotests.imgfmt,
+                         '-b', self.target_backing, target_img, '1M')
+            else:
+                qemu_img('create', '-f', iotests.imgfmt, target_img, '1M')
+
+            if self.cmd == 'blockdev-mirror':
+                options = { 'node-name': 'target',
+                            'driver': iotests.imgfmt,
+                            'file': { 'driver': 'file',
+                                      'filename': target_img } }
+                if self.target_blockdev_backing:
+                    options['backing'] = self.target_blockdev_backing
+
+                result = self.vm.qmp('blockdev-add', options=options)
+                self.assert_qmp(result, 'return', {})
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(source_img)
+        os.remove(back2_img)
+        os.remove(back1_img)
+        os.remove(back0_img)
+        try:
+            os.remove(target_img)
+        except OSError:
+            pass
+
+    def findBlockNode(self, node_name, id=None):
+        if id:
+            result = self.vm.qmp('query-block')
+            for device in result['return']:
+                if device['device'] == id:
+                    if node_name:
+                        self.assert_qmp(device, 'inserted/node-name', node_name)
+                    return device['inserted']
+        else:
+            result = self.vm.qmp('query-named-block-nodes')
+            for node in result['return']:
+                if node['node-name'] == node_name:
+                    return node
+
+        self.fail('Cannot find node %s/%s' % (id, node_name))
+
+    def assertIntactSourceBackingChain(self):
+        node = self.findBlockNode('source')
+
+        self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename',
+                        source_img)
+        self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename',
+                        back2_img)
+        self.assert_qmp(node, 'image' + '/backing-image' * 2 + '/filename',
+                        back1_img)
+        self.assert_qmp(node, 'image' + '/backing-image' * 3 + '/filename',
+                        back0_img)
+        self.assert_qmp_absent(node, 'image' + '/backing-image' * 4)
+
+    def assertCorrectBackingImage(self, node, default_image):
+        if self.existing:
+            if self.target_real_backing:
+                image = self.target_real_backing
+            else:
+                image = self.target_backing
+        else:
+            image = default_image
+
+        if image:
+            self.assert_qmp(node, 'image/backing-image/filename', image)
+        else:
+            self.assert_qmp_absent(node, 'image/backing-image')
+
+
+# Class variables for controlling its behavior:
+#
+# cmd: Mirroring command to execute, either drive-mirror or blockdev-mirror
+
+class MirrorBaseClass(BaseClass):
+    def runMirror(self, sync):
+        if self.cmd == 'blockdev-mirror':
+            result = self.vm.qmp(self.cmd, device='drive0', sync=sync,
+                                 target='target')
+        else:
+            if self.existing:
+                mode = 'existing'
+            else:
+                mode = 'absolute-paths'
+            result = self.vm.qmp(self.cmd, device='drive0', sync=sync,
+                                 target=target_img, format=iotests.imgfmt,
+                                 mode=mode, node_name='target')
+
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait('BLOCK_JOB_READY')
+
+        result = self.vm.qmp('block-job-complete', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait('BLOCK_JOB_COMPLETED')
+
+    def testFull(self):
+        self.runMirror('full')
+
+        node = self.findBlockNode('target', 'drive0')
+        self.assertCorrectBackingImage(node, None)
+        self.assertIntactSourceBackingChain()
+
+    def testTop(self):
+        self.runMirror('top')
+
+        node = self.findBlockNode('target', 'drive0')
+        self.assertCorrectBackingImage(node, back2_img)
+        self.assertIntactSourceBackingChain()
+
+    def testNone(self):
+        self.runMirror('none')
+
+        node = self.findBlockNode('target', 'drive0')
+        self.assertCorrectBackingImage(node, source_img)
+        self.assertIntactSourceBackingChain()
+
+
+class TestDriveMirrorAbsolutePaths(MirrorBaseClass):
+    cmd = 'drive-mirror'
+    existing = False
+
+class TestDriveMirrorExistingNoBacking(MirrorBaseClass):
+    cmd = 'drive-mirror'
+    existing = True
+    target_backing = None
+
+class TestDriveMirrorExistingBacking(MirrorBaseClass):
+    cmd = 'drive-mirror'
+    existing = True
+    target_backing = 'null-co://'
+
+class TestBlockdevMirrorNoBacking(MirrorBaseClass):
+    cmd = 'blockdev-mirror'
+    existing = True
+    target_backing = None
+
+class TestBlockdevMirrorBacking(MirrorBaseClass):
+    cmd = 'blockdev-mirror'
+    existing = True
+    target_backing = 'null-co://'
+
+class TestBlockdevMirrorForcedBacking(MirrorBaseClass):
+    cmd = 'blockdev-mirror'
+    existing = True
+    target_backing = None
+    target_blockdev_backing = { 'driver': 'null-co' }
+    target_real_backing = 'null-co://'
+
+
+class TestCommit(BaseClass):
+    existing = False
+
+    def testCommit(self):
+        result = self.vm.qmp('block-commit', device='drive0', base=back1_img)
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait('BLOCK_JOB_READY')
+
+        result = self.vm.qmp('block-job-complete', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait('BLOCK_JOB_COMPLETED')
+
+        node = self.findBlockNode(None, 'drive0')
+        self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename',
+                        back1_img)
+        self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename',
+                        back0_img)
+        self.assert_qmp_absent(node, 'image' + '/backing-image' * 2 +
+                               '/filename')
+
+        self.assertIntactSourceBackingChain()
+
+
+BaseClass = None
+MirrorBaseClass = None
+
+if __name__ == '__main__':
+    iotests.main(supported_fmts=['qcow2'])
diff --git a/tests/qemu-iotests/155.out b/tests/qemu-iotests/155.out
new file mode 100644
index 0000000000..4176bb9402
--- /dev/null
+++ b/tests/qemu-iotests/155.out
@@ -0,0 +1,5 @@
+...................
+----------------------------------------------------------------------
+Ran 19 tests
+
+OK
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index ab1d76efdf..9f1f2c0d9e 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -154,3 +154,4 @@
 150 rw auto quick
 152 rw auto quick
 154 rw auto backing quick
+155 rw auto

From 3dd48fdc55613c102f2811c63ceefd9aebcd44ca Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 10 Jun 2016 20:57:50 +0200
Subject: [PATCH 38/39] iotests: Add test for oVirt-like storage migration

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20160610185750.30956-6-mreitz@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/156     | 174 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/156.out |  48 ++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 223 insertions(+)
 create mode 100755 tests/qemu-iotests/156
 create mode 100644 tests/qemu-iotests/156.out

diff --git a/tests/qemu-iotests/156 b/tests/qemu-iotests/156
new file mode 100755
index 0000000000..cc95ff1f98
--- /dev/null
+++ b/tests/qemu-iotests/156
@@ -0,0 +1,174 @@
+#!/bin/bash
+#
+# Tests oVirt-like storage migration:
+#  - Create snapshot
+#  - Create target image with (not yet existing) target backing chain
+#    (i.e. just write the name of a soon-to-be-copied-over backing file into it)
+#  - drive-mirror the snapshot to the target with mode=existing and sync=top
+#  - In the meantime, copy the original source files to the destination via
+#    conventional means (i.e. outside of qemu)
+#  - Complete the drive-mirror job
+#  - Delete all source images
+#
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz@redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+status=1	# failure is the default!
+
+_cleanup()
+{
+    rm -f "$TEST_IMG{,.target}{,.backing,.overlay}"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+_supported_fmt qcow2 qed
+_supported_proto generic
+_supported_os Linux
+
+# Create source disk
+TEST_IMG="$TEST_IMG.backing" _make_test_img 1M
+_make_test_img -b "$TEST_IMG.backing" 1M
+
+$QEMU_IO -c 'write -P 1 0 256k' "$TEST_IMG.backing" | _filter_qemu_io
+$QEMU_IO -c 'write -P 2 64k 192k' "$TEST_IMG" | _filter_qemu_io
+
+_launch_qemu -drive if=none,id=source,file="$TEST_IMG"
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'qmp_capabilities' }" \
+    'return'
+
+# Create snapshot
+TEST_IMG="$TEST_IMG.overlay" _make_test_img -b "$TEST_IMG" 1M
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'blockdev-snapshot-sync',
+       'arguments': { 'device': 'source',
+                      'snapshot-file': '$TEST_IMG.overlay',
+                      'format': '$IMGFMT',
+                      'mode': 'existing' } }" \
+    'return'
+
+# Write something to the snapshot
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"write -P 3 128k 128k\"' } }" \
+    'return'
+
+# Create target image
+TEST_IMG="$TEST_IMG.target.overlay" _make_test_img -b "$TEST_IMG.target" 1M
+
+# Mirror snapshot
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'drive-mirror',
+       'arguments': { 'device': 'source',
+                      'target': '$TEST_IMG.target.overlay',
+                      'mode': 'existing',
+                      'sync': 'top' } }" \
+    'return'
+
+# Wait for convergence
+_send_qemu_cmd $QEMU_HANDLE \
+    '' \
+    'BLOCK_JOB_READY'
+
+# Write some more
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"write -P 4 192k 64k\"' } }" \
+    'return'
+
+# Copy source backing chain to the target before completing the job
+cp "$TEST_IMG.backing" "$TEST_IMG.target.backing"
+cp "$TEST_IMG" "$TEST_IMG.target"
+$QEMU_IMG rebase -u -b "$TEST_IMG.target.backing" "$TEST_IMG.target"
+
+# Complete block job
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'block-job-complete',
+       'arguments': { 'device': 'source' } }" \
+    ''
+
+_send_qemu_cmd $QEMU_HANDLE \
+    '' \
+    'BLOCK_JOB_COMPLETED'
+
+# Remove the source images
+rm -f "$TEST_IMG{,.backing,.overlay}"
+
+echo
+
+# Check online disk contents
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"read -P 1 0k 64k\"' } }" \
+    'return'
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"read -P 2 64k 64k\"' } }" \
+    'return'
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"read -P 3 128k 64k\"' } }" \
+    'return'
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'human-monitor-command',
+       'arguments': { 'command-line':
+                      'qemu-io source \"read -P 4 192k 64k\"' } }" \
+    'return'
+
+echo
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{ 'execute': 'quit' }" \
+    'return'
+
+wait=1 _cleanup_qemu
+
+echo
+
+# Check offline disk contents
+$QEMU_IO -c 'read -P 1 0k 64k' \
+         -c 'read -P 2 64k 64k' \
+         -c 'read -P 3 128k 64k' \
+         -c 'read -P 4 192k 64k' \
+         "$TEST_IMG.target.overlay" | _filter_qemu_io
+
+echo
+
+# success, all done
+echo '*** done'
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/156.out b/tests/qemu-iotests/156.out
new file mode 100644
index 0000000000..3af82ae540
--- /dev/null
+++ b/tests/qemu-iotests/156.out
@@ -0,0 +1,48 @@
+QA output created by 156
+Formatting 'TEST_DIR/t.IMGFMT.backing', fmt=IMGFMT size=1048576
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.backing
+wrote 262144/262144 bytes at offset 0
+256 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 196608/196608 bytes at offset 65536
+192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": {}}
+Formatting 'TEST_DIR/t.IMGFMT.overlay', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT
+{"return": {}}
+wrote 131072/131072 bytes at offset 131072
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+Formatting 'TEST_DIR/t.IMGFMT.target.overlay', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.target
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "source", "len": 131072, "offset": 131072, "speed": 0, "type": "mirror"}}
+wrote 65536/65536 bytes at offset 196608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "source", "len": 196608, "offset": 196608, "speed": 0, "type": "mirror"}}
+
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+read 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+read 65536/65536 bytes at offset 131072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+read 65536/65536 bytes at offset 196608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN"}
+
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 131072
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 196608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 9f1f2c0d9e..1c6fcb6018 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -155,3 +155,4 @@
 152 rw auto quick
 154 rw auto backing quick
 155 rw auto
+156 rw auto quick

From 0e321191224c8cd137eef41da3257e096965c3d6 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 14 Jun 2016 20:08:12 +0300
Subject: [PATCH 39/39] hbitmap: add 'pos < size' asserts

For now, fail in hbitmap_set on start + count > size will come from
hbitmap_set
  hb_count_between
    hbitmap_iter_init
      assert(pos < hb->size)

This patch adds such checks to set/get/reset functions of hbitmap.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 1465924093-76875-2-git-send-email-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 util/hbitmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/util/hbitmap.c b/util/hbitmap.c
index 7121b11c01..99fd2ba37b 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -269,6 +269,7 @@ void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
     start >>= hb->granularity;
     last >>= hb->granularity;
     count = last - start + 1;
+    assert(last < hb->size);
 
     hb->count += count - hb_count_between(hb, start, last);
     hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
@@ -348,6 +349,7 @@ void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
 
     start >>= hb->granularity;
     last >>= hb->granularity;
+    assert(last < hb->size);
 
     hb->count -= hb_count_between(hb, start, last);
     hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last);
@@ -371,6 +373,7 @@ bool hbitmap_get(const HBitmap *hb, uint64_t item)
     /* Compute position and bit in the last layer.  */
     uint64_t pos = item >> hb->granularity;
     unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1));
+    assert(pos < hb->size);
 
     return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
 }