From 443668ca408cdb7f01e1a70a58518bb100c3e9d1 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev"
Date: Wed, 25 May 2016 21:48:45 -0600
Subject: [PATCH 01/31] block: split write_zeroes always

We should split requests even if they are less than write_zeroes_alignment.
For example we can have the following request:
  offset                   62k
  size                     4k
  write_zeroes_alignment   64k
The original code sent 1 request covering 2 qcow2 clusters, and resulted
in both clusters being allocated. But by splitting the request, we can
cater to the case where one of the two clusters can be zeroed as a whole,
for only 1 cluster allocated after the operation.

Signed-off-by: Denis V. Lunev
CC: Eric Blake
CC: Kevin Wolf
Message-Id: <1463476543-3087-2-git-send-email-den@openvz.org>
[eblake: Avoid exceeding nb_sectors, hoist alignment checks out of
loop, and update testsuite to show that patch works]
Signed-off-by: Eric Blake
Reviewed-by: Fam Zheng
Signed-off-by: Kevin Wolf
---
 block/io.c                 | 30 +++++++++++++++++-------------
 tests/qemu-iotests/154.out | 18 ++++++++++++------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/block/io.c b/block/io.c
index 6070e773b7..c73be55f86 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1118,28 +1118,32 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     struct iovec iov = {0};
     int ret = 0;
     bool need_flush = false;
+    int head = 0;
+    int tail = 0;
 
     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                         BDRV_REQUEST_MAX_SECTORS);
 
+    if (bs->bl.write_zeroes_alignment) {
+        assert(is_power_of_2(bs->bl.write_zeroes_alignment));
+        head = sector_num & (bs->bl.write_zeroes_alignment - 1);
+        tail = (sector_num + nb_sectors) & (bs->bl.write_zeroes_alignment - 1);
+        max_write_zeroes &= ~(bs->bl.write_zeroes_alignment - 1);
+    }
+
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
 
         /* Align request.  Block drivers can expect the "bulk" of the request
-         * to be aligned.
+         * to be aligned, and that unaligned requests do not cross cluster
+         * boundaries.
          */
-        if (bs->bl.write_zeroes_alignment
-            && num > bs->bl.write_zeroes_alignment) {
-            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
-                /* Make a small request up to the first aligned sector. */
-                num = bs->bl.write_zeroes_alignment;
-                num -= sector_num % bs->bl.write_zeroes_alignment;
-            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
-                /* Shorten the request to the last aligned sector.  num cannot
-                 * underflow because num > bs->bl.write_zeroes_alignment.
-                 */
-                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
-            }
+        if (head) {
+            /* Make a small request up to the first aligned sector. */
+            num = MIN(nb_sectors, bs->bl.write_zeroes_alignment - head);
+            head = 0;
+        } else if (tail && num > bs->bl.write_zeroes_alignment) {
+            /* Shorten the request to the last aligned sector. */
+            num -= tail;
         }
 
         /* limit request size */
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index 8946b734ac..b9d27c501a 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -106,11 +106,14 @@ read 1024/1024 bytes at offset 67584
 read 5120/5120 bytes at offset 68608
 5 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
-{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 36864, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 49152, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 53248, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
 
 == spanning two clusters, non-zero after request ==
@@ -145,11 +148,14 @@ read 7168/7168 bytes at offset 65536
 read 1024/1024 bytes at offset 72704
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
-{ "start": 32768, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 36864, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
 { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 49152, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 49152, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 53248, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
 { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 36864},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": true, "data": false},
+{ "start": 69632, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
 { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
 
 == spanning two clusters, partially overwriting backing file ==

From ba142846b0f608c433e71d61efc6467c7b367dbf Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev"
Date: Wed, 25 May 2016 21:48:46 -0600
Subject: [PATCH 02/31] qcow2: simplify logic in qcow2_co_write_zeroes

Unaligned requests will occupy only one cluster. This is true since the
previous commit. Simplify the code taking this consideration into
account. In other words, the caller is now buggy if it ever passes us an
unaligned request that crosses cluster boundaries (the only requests
that can cross boundaries will be aligned).

There are no other changes so far.

Signed-off-by: Denis V. Lunev
Reviewed-by: Eric Blake
CC: Eric Blake
CC: Kevin Wolf
Message-Id: <1463476543-3087-3-git-send-email-den@openvz.org>
Signed-off-by: Kevin Wolf
---
 block/qcow2.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index c9306a78c1..2f7320110e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2438,33 +2438,20 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
     int tail = (sector_num + nb_sectors) % s->cluster_sectors;
 
     if (head != 0 || tail != 0) {
-        int64_t cl_end = -1;
+        int64_t cl_start = sector_num - head;
 
-        sector_num -= head;
-        nb_sectors += head;
+        assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);
 
-        if (tail != 0) {
-            nb_sectors += s->cluster_sectors - tail;
-        }
+        sector_num = cl_start;
+        nb_sectors = s->cluster_sectors;
 
         if (!is_zero_cluster(bs, sector_num)) {
             return -ENOTSUP;
         }
 
-        if (nb_sectors > s->cluster_sectors) {
-            /* Technically the request can cover 2 clusters, f.e. 4k write
-               at s->cluster_sectors - 2k offset. One of these cluster can
-               be zeroed, one unallocated */
-            cl_end = sector_num + nb_sectors - s->cluster_sectors;
-            if (!is_zero_cluster(bs, cl_end)) {
-                return -ENOTSUP;
-            }
-        }
-
         qemu_co_mutex_lock(&s->lock);
         /* We can have new write after previous check */
-        if (!is_zero_cluster_top_locked(bs, sector_num) ||
-            (cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) {
+        if (!is_zero_cluster_top_locked(bs, sector_num)) {
             qemu_co_mutex_unlock(&s->lock);
             return -ENOTSUP;
         }

From 5a64e9425160ace607b2043804d73d5579808bd3 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev"
Date: Wed, 25 May 2016 21:48:47 -0600
Subject: [PATCH 03/31] qcow2: add tracepoints for qcow2_co_write_zeroes

This patch follows guidelines of all other tracepoints in qcow2, like
ones in qcow2_co_writev. I think that they should dump values in the
same quantities or be changed all together.

Signed-off-by: Denis V. Lunev
CC: Eric Blake
CC: Kevin Wolf
Message-Id: <1463476543-3087-4-git-send-email-den@openvz.org>
[eblake: typo fix in commit message]
Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/qcow2.c | 5 +++++
 trace-events  | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 2f7320110e..105fd5e753 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2437,6 +2437,9 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
     int head = sector_num % s->cluster_sectors;
     int tail = (sector_num + nb_sectors) % s->cluster_sectors;
 
+    trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
+                                       nb_sectors);
+
     if (head != 0 || tail != 0) {
         int64_t cl_start = sector_num - head;
 
@@ -2459,6 +2462,8 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
         qemu_co_mutex_lock(&s->lock);
     }
 
+    trace_qcow2_write_zeroes(qemu_coroutine_self(), sector_num, nb_sectors);
+
     /* Whatever is left can use real zero clusters */
     ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
     qemu_co_mutex_unlock(&s->lock);
diff --git a/trace-events b/trace-events
index c55d7084fb..5f720d0ddb 100644
--- a/trace-events
+++ b/trace-events
@@ -611,6 +611,8 @@ qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
 qcow2_writev_start_part(void *co) "co %p"
 qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
+qcow2_write_zeroes_start_req(void *co, int64_t sector, int nb_sectors) "co %p sector %" PRIx64 " nb_sectors %d"
+qcow2_write_zeroes(void *co, int64_t sector, int nb_sectors) "co %p sector %" PRIx64 " nb_sectors %d"
 
 # block/qcow2-cluster.c
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int num) "co %p offset %" PRIx64 " num %d"

From 31ad4fdf914e86a2ea105ff3687a0f4dfc4802cb Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 25 May 2016 21:48:48 -0600
Subject: [PATCH 04/31] qemu-iotests: Test one more spot for optimizing write_zeroes

Add another test to 154, showing that we currently allocate a data
cluster in the top layer if any sector of the backing file was
allocated. The next patch will optimize this case.
Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 tests/qemu-iotests/154     | 40 ++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/154.out | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154
index 23f1b3ab16..5905c55de9 100755
--- a/tests/qemu-iotests/154
+++ b/tests/qemu-iotests/154
@@ -114,6 +114,46 @@ $QEMU_IO -c "read -P 0 40k 3k" "$TEST_IMG" | _filter_qemu_io
 
 $QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
 
+echo
+echo == write_zeroes covers non-zero data ==
+
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $size
+_make_test_img -b "$TEST_IMG.base"
+
+# non-zero data at front of request
+# Backing file: -- XX -- --
+# Active layer: -- 00 00 --
+
+$QEMU_IO -c "write -P 0x11 5k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 5k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+# non-zero data at end of request
+# Backing file: -- -- XX --
+# Active layer: -- 00 00 --
+
+$QEMU_IO -c "write -P 0x11 14k 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 13k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 12k 4k" "$TEST_IMG" | _filter_qemu_io
+
+# non-zero data matches size of request
+# Backing file: -- XX XX --
+# Active layer: -- 00 00 --
+
+$QEMU_IO -c "write -P 0x11 21k 2k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 21k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 20k 4k" "$TEST_IMG" | _filter_qemu_io
+
+# non-zero data smaller than request
+# Backing file: -- -X X- --
+# Active layer: -- 00 00 --
+
+$QEMU_IO -c "write -P 0x11 30208 1k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 29k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 28k 4k" "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
 echo
 echo == spanning two clusters, non-zero before request ==
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index b9d27c501a..531fd8cd88 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -74,6 +74,43 @@ read 3072/3072 bytes at offset 40960
 { "start": 40960, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
 { "start": 45056, "length": 134172672, "depth": 1, "zero": true, "data": false}]
 
+== write_zeroes covers non-zero data ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 5120
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 5120
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 14336
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 13312
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 12288
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 21504
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 21504
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 20480
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 30208
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 29696
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 28672
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 4096, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 8192, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 12288, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 16384, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 20480, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 24576, "length": 4096, "depth": 1, "zero": true, "data": false},
+{ "start": 28672, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 32768},
+{ "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}]
+
 == spanning two clusters, non-zero before request ==
 Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base

From ebb718a5c7240f6ffb308e0d0b67a92c3b63b91c Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 25 May 2016 21:48:49 -0600
Subject: [PATCH 05/31] qcow2: Catch more unaligned write_zero into zero cluster

is_zero_cluster() and is_zero_cluster_top_locked() are used only by
qcow2_co_write_zeroes(). The former is too broad (we don't care if the
sectors we are about to overwrite are non-zero, only that all other
sectors in the cluster are zero), so it needs to be called up to twice
but with smaller limits - rename it along with adding the needed
parameter. The latter can be inlined for more compact code.

The testsuite change shows that we now have a sparser top file when an
unaligned write_zeroes overwrites the only portion of the backing file
with data.

Based on a patch proposal by Denis V. Lunev.

CC: Denis V. Lunev
Signed-off-by: Eric Blake
Reviewed-by: Denis V. Lunev
Signed-off-by: Kevin Wolf
---
 block/qcow2.c              | 47 +++++++++++++++++-------------------
 tests/qemu-iotests/154.out |  8 +++----
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 105fd5e753..ecac399b43 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2406,26 +2406,19 @@ finish:
 }
 
-static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
+static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
+                            uint32_t count)
 {
-    BDRVQcow2State *s = bs->opaque;
     int nr;
     BlockDriverState *file;
-    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
-                                              s->cluster_sectors, &nr, &file);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
-}
+    int64_t res;
 
-static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
-{
-    BDRVQcow2State *s = bs->opaque;
-    int nr = s->cluster_sectors;
-    uint64_t off;
-    int ret;
-
-    ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
-    assert(nr == s->cluster_sectors);
-    return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
+    if (!count) {
+        return true;
+    }
+    res = bdrv_get_block_status_above(bs, NULL, start, count,
+                                      &nr, &file);
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
 }
 
 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
@@ -2434,27 +2427,33 @@
     int ret;
     BDRVQcow2State *s = bs->opaque;
-    int head = sector_num % s->cluster_sectors;
-    int tail = (sector_num + nb_sectors) % s->cluster_sectors;
+    uint32_t head = sector_num % s->cluster_sectors;
+    uint32_t tail = (sector_num + nb_sectors) % s->cluster_sectors;
 
     trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
                                        nb_sectors);
 
-    if (head != 0 || tail != 0) {
+    if (head || tail) {
         int64_t cl_start = sector_num - head;
+        uint64_t off;
+        int nr;
 
         assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);
 
-        sector_num = cl_start;
-        nb_sectors = s->cluster_sectors;
-
-        if (!is_zero_cluster(bs, sector_num)) {
+        /* check whether remainder of cluster already reads as zero */
+        if (!(is_zero_sectors(bs, cl_start, head) &&
+              is_zero_sectors(bs, sector_num + nb_sectors,
+                              -tail & (s->cluster_sectors - 1)))) {
             return -ENOTSUP;
         }
 
         qemu_co_mutex_lock(&s->lock);
         /* We can have new write after previous check */
-        if (!is_zero_cluster_top_locked(bs, sector_num)) {
+        sector_num = cl_start;
+        nb_sectors = nr = s->cluster_sectors;
+        ret = qcow2_get_cluster_offset(bs, cl_start << BDRV_SECTOR_BITS,
+                                       &nr, &off);
+        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
             qemu_co_mutex_unlock(&s->lock);
             return -ENOTSUP;
         }
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index 531fd8cd88..da9eabdda8 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -102,13 +102,13 @@ wrote 2048/2048 bytes at offset 29696
 read 4096/4096 bytes at offset 28672
 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 4096, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 4096, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 8192, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 12288, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 16384, "length": 4096, "depth": 1, "zero": true, "data": false},
true, "data": false}, -{ "start": 20480, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672}, +{ "start": 20480, "length": 4096, "depth": 0, "zero": true, "data": false}, { "start": 24576, "length": 4096, "depth": 1, "zero": true, "data": false}, -{ "start": 28672, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 32768}, +{ "start": 28672, "length": 4096, "depth": 0, "zero": true, "data": false}, { "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}] == spanning two clusters, non-zero before request == From 8b1847445159ff6c12e7a24f04dc989ff97e34d4 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:01 -0600 Subject: [PATCH 06/31] iscsi: Use block size as minimum zero/discard alignment If hardware does not advertise a minimum zero/discard alignment, we still want to guarantee that the block layer will align requests to our blocks, rather than the arbitrary 512-byte BDRV sector size. Signed-off-by: Eric Blake Signed-off-by: Kevin Wolf --- block/iscsi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index e7d5f7b0c3..94f9974f10 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1711,6 +1711,8 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) } bs->bl.discard_alignment = sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + } else { + bs->bl.discard_alignment = iscsilun->block_size >> BDRV_SECTOR_BITS; } if (iscsilun->bl.max_ws_len < 0xffffffff) { @@ -1720,6 +1722,9 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) if (iscsilun->lbp.lbpws) { bs->bl.write_zeroes_alignment = sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + } else { + bs->bl.write_zeroes_alignment = + iscsilun->block_size >> BDRV_SECTOR_BITS; } bs->bl.opt_transfer_length = sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); From cf081fca4e3cc02a309659b571ab0c5b225ea4cf Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:02 -0600 Subject: [PATCH 07/31] block: Track write zero limits in bytes Another step towards removing sector-based interfaces: convert the maximum write and minimum alignment values from sectors to bytes. Rename the variables to let the compiler check that all users are converted to the new semantics. The maximum remains an int as long as BDRV_REQUEST_MAX_SECTORS is constrained by INT_MAX (this means that we can't even support a 2G write_zeroes, but just under it) - changing operation lengths to unsigned or to 64-bits is a much bigger audit, and debatable if we even want to do it (since at the core, a 32-bit platform will still have ssize_t as its underlying limit on write()). Meanwhile, alignment is changed to 'uint32_t', since it makes no sense to have an alignment larger than the maximum write, and less painful to use an unsigned type with well-defined behavior in bit operations than to have to worry about what happens if a driver mistakenly supplies a negative alignment. Add an assert that no one was trying to use sectors to get a write zeroes larger than 2G, and therefore that a later conversion to bytes won't be impacted by keeping the limit at 32 bits. 
Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/io.c                | 22 +++++++++++++---------
 block/iscsi.c             | 13 ++++++-------
 block/qcow2.c             |  2 +-
 block/qed.c               |  2 +-
 block/vmdk.c              |  6 +++---
 include/block/block_int.h | 10 ++++++----
 6 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/block/io.c b/block/io.c
index c73be55f86..1eda1b2f58 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1121,15 +1121,19 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int head = 0;
     int tail = 0;
 
-    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
-                                        BDRV_REQUEST_MAX_SECTORS);
-    if (bs->bl.write_zeroes_alignment) {
-        assert(is_power_of_2(bs->bl.write_zeroes_alignment));
-        head = sector_num & (bs->bl.write_zeroes_alignment - 1);
-        tail = (sector_num + nb_sectors) & (bs->bl.write_zeroes_alignment - 1);
-        max_write_zeroes &= ~(bs->bl.write_zeroes_alignment - 1);
+    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
+    int write_zeroes_sector_align =
+        bs->bl.pwrite_zeroes_alignment >> BDRV_SECTOR_BITS;
+
+    max_write_zeroes >>= BDRV_SECTOR_BITS;
+    if (write_zeroes_sector_align) {
+        assert(is_power_of_2(bs->bl.pwrite_zeroes_alignment));
+        head = sector_num & (write_zeroes_sector_align - 1);
+        tail = (sector_num + nb_sectors) & (write_zeroes_sector_align - 1);
+        max_write_zeroes &= ~(write_zeroes_sector_align - 1);
     }
+    assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
 
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
@@ -1139,9 +1143,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
          */
         if (head) {
             /* Make a small request up to the first aligned sector. */
-            num = MIN(nb_sectors, bs->bl.write_zeroes_alignment - head);
+            num = MIN(nb_sectors, write_zeroes_sector_align - head);
             head = 0;
-        } else if (tail && num > bs->bl.write_zeroes_alignment) {
+        } else if (tail && num > write_zeroes_sector_align) {
             /* Shorten the request to the last aligned sector. */
             num -= tail;
         }
 
diff --git a/block/iscsi.c b/block/iscsi.c
index 94f9974f10..52ea9d7a70 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1715,16 +1715,15 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp)
         bs->bl.discard_alignment = iscsilun->block_size >> BDRV_SECTOR_BITS;
     }
 
-    if (iscsilun->bl.max_ws_len < 0xffffffff) {
-        bs->bl.max_write_zeroes =
-            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
+    if (iscsilun->bl.max_ws_len < 0xffffffff / iscsilun->block_size) {
+        bs->bl.max_pwrite_zeroes =
+            iscsilun->bl.max_ws_len * iscsilun->block_size;
     }
     if (iscsilun->lbp.lbpws) {
-        bs->bl.write_zeroes_alignment =
-            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
+        bs->bl.pwrite_zeroes_alignment =
+            iscsilun->bl.opt_unmap_gran * iscsilun->block_size;
     } else {
-        bs->bl.write_zeroes_alignment =
-            iscsilun->block_size >> BDRV_SECTOR_BITS;
+        bs->bl.pwrite_zeroes_alignment = iscsilun->block_size;
     }
     bs->bl.opt_transfer_length =
         sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
diff --git a/block/qcow2.c b/block/qcow2.c
index ecac399b43..a6ea6cb62d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1193,7 +1193,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
 
-    bs->bl.write_zeroes_alignment = s->cluster_sectors;
+    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
 }
 
 static int qcow2_set_key(BlockDriverState *bs, const char *key)
diff --git a/block/qed.c b/block/qed.c
index 0f621924f8..113b8e8cc6 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVQEDState *s = bs->opaque;
 
-    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
+    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
 }
 
 /* We have nothing to do for QED reopen, stubs just return
diff --git a/block/vmdk.c b/block/vmdk.c
index 2205cc888f..5a01e1672f 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -997,9 +997,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
 
     for (i = 0; i < s->num_extents; i++) {
         if (!s->extents[i].flat) {
-            bs->bl.write_zeroes_alignment =
-                MAX(bs->bl.write_zeroes_alignment,
-                    s->extents[i].cluster_sectors);
+            bs->bl.pwrite_zeroes_alignment =
+                MAX(bs->bl.pwrite_zeroes_alignment,
+                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
         }
     }
 }
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 30a97178c8..2e9c81ff95 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -328,11 +328,13 @@ typedef struct BlockLimits {
     /* optimal alignment for discard requests in sectors */
     int64_t discard_alignment;
 
-    /* maximum number of sectors that can zeroized at once */
-    int max_write_zeroes;
+    /* maximum number of bytes that can zeroized at once (since it is
+     * signed, it must be < 2G, if set) */
+    int32_t max_pwrite_zeroes;
 
-    /* optimal alignment for write zeroes requests in sectors */
-    int64_t write_zeroes_alignment;
+    /* optimal alignment for write zeroes requests in bytes, must be
+     * power of 2, and less than max_pwrite_zeroes if that is set */
+    uint32_t pwrite_zeroes_alignment;
 
     /* optimal transfer length in sectors */
     int opt_transfer_length;

From d05aa8bb4a8b6aa9a915ec5074fb12ae632d2323 Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:03 -0600
Subject: [PATCH 08/31] block: Add .bdrv_co_pwrite_zeroes()

Update bdrv_co_do_write_zeroes() to be byte-based, and select between
the new byte-based bdrv_co_pwrite_zeroes() or the old
bdrv_co_write_zeroes(). The next patches will convert drivers, then
remove the old interface.

Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/io.c                | 78 +++++++++++++++++++++------------------
 include/block/block_int.h |  4 +-
 2 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/block/io.c b/block/io.c
index 1eda1b2f58..91a2e23f5f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -42,8 +42,8 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                          void *opaque,
                                          bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags);
 
 static void bdrv_parent_drained_begin(BlockDriverState *bs)
 {
@@ -893,10 +893,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
         goto err;
     }
 
-    if (drv->bdrv_co_write_zeroes &&
+    if ((drv->bdrv_co_write_zeroes || drv->bdrv_co_pwrite_zeroes) &&
         buffer_is_zero(bounce_buffer, iov.iov_len)) {
-        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
-                                      cluster_nb_sectors, 0);
+        ret = bdrv_co_do_pwrite_zeroes(bs,
+                                       cluster_sector_num * BDRV_SECTOR_SIZE,
+                                       cluster_nb_sectors * BDRV_SECTOR_SIZE,
+                                       0);
     } else {
         /* This does not change the data on the disk, it is not necessary
          * to flush even in cache=writethrough mode.
@@ -1110,8 +1112,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
 
 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
 
-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
     BlockDriver *drv = bs->drv;
     QEMUIOVector qiov;
@@ -1122,20 +1124,16 @@
     int tail = 0;
 
     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
-    int write_zeroes_sector_align =
-        bs->bl.pwrite_zeroes_alignment >> BDRV_SECTOR_BITS;
+    int alignment = MAX(bs->bl.pwrite_zeroes_alignment ?: 1,
+                        bs->request_alignment);
 
-    max_write_zeroes >>= BDRV_SECTOR_BITS;
-    if (write_zeroes_sector_align) {
-        assert(is_power_of_2(bs->bl.pwrite_zeroes_alignment));
-        head = sector_num & (write_zeroes_sector_align - 1);
-        tail = (sector_num + nb_sectors) & (write_zeroes_sector_align - 1);
-        max_write_zeroes &= ~(write_zeroes_sector_align - 1);
-    }
+    assert(is_power_of_2(alignment));
+    head = offset & (alignment - 1);
+    tail = (offset + count) & (alignment - 1);
+    max_write_zeroes &= ~(alignment - 1);
 
-    assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
-    while (nb_sectors > 0 && !ret) {
-        int num = nb_sectors;
+    while (count > 0 && !ret) {
+        int num = count;
 
         /* Align request.  Block drivers can expect the "bulk" of the request
          * to be aligned, and that unaligned requests do not cross cluster
@@ -1143,9 +1141,9 @@
          */
         if (head) {
             /* Make a small request up to the first aligned sector. */
-            num = MIN(nb_sectors, write_zeroes_sector_align - head);
+            num = MIN(count, alignment - head);
             head = 0;
-        } else if (tail && num > write_zeroes_sector_align) {
+        } else if (tail && num > alignment) {
             /* Shorten the request to the last aligned sector. */
             num -= tail;
         }
 
@@ -1157,8 +1155,18 @@
         ret = -ENOTSUP;
 
         /* First try the efficient write zeroes operation */
-        if (drv->bdrv_co_write_zeroes) {
-            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num,
+        if (drv->bdrv_co_pwrite_zeroes) {
+            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
+                                             flags & bs->supported_zero_flags);
+            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
+                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
+                need_flush = true;
+            }
+        } else if (drv->bdrv_co_write_zeroes) {
+            assert(offset % BDRV_SECTOR_SIZE == 0);
+            assert(count % BDRV_SECTOR_SIZE == 0);
+            ret = drv->bdrv_co_write_zeroes(bs, offset >> BDRV_SECTOR_BITS,
+                                            num >> BDRV_SECTOR_BITS,
                                             flags & bs->supported_zero_flags);
             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
@@ -1181,33 +1189,31 @@
                 write_flags &= ~BDRV_REQ_FUA;
                 need_flush = true;
             }
-            num = MIN(num, max_xfer_len);
-            iov.iov_len = num * BDRV_SECTOR_SIZE;
+            num = MIN(num, max_xfer_len << BDRV_SECTOR_BITS);
+            iov.iov_len = num;
             if (iov.iov_base == NULL) {
-                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                iov.iov_base = qemu_try_blockalign(bs, num);
                 if (iov.iov_base == NULL) {
                     ret = -ENOMEM;
                     goto fail;
                 }
-                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+                memset(iov.iov_base, 0, num);
             }
             qemu_iovec_init_external(&qiov, &iov, 1);
 
-            ret = bdrv_driver_pwritev(bs, sector_num * BDRV_SECTOR_SIZE,
-                                      num * BDRV_SECTOR_SIZE, &qiov,
-                                      write_flags);
+            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
 
             /* Keep bounce buffer around if it is big enough for all
              * all future requests.
              */
-            if (num < max_xfer_len) {
+            if (num < max_xfer_len << BDRV_SECTOR_BITS) {
                 qemu_vfree(iov.iov_base);
                 iov.iov_base = NULL;
             }
         }
 
-        sector_num += num;
-        nb_sectors -= num;
+        offset += num;
+        count -= num;
     }
 
 fail:
@@ -1245,7 +1251,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
-        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
+        !(flags & BDRV_REQ_ZERO_WRITE) &&
+        (drv->bdrv_co_pwrite_zeroes || drv->bdrv_co_write_zeroes) &&
         qemu_iovec_is_zero(qiov)) {
         flags |= BDRV_REQ_ZERO_WRITE;
         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
@@ -1257,7 +1264,8 @@
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
-        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
+        ret = bdrv_co_do_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
+                                       nb_sectors << BDRV_SECTOR_BITS, flags);
     } else {
         bdrv_debug_event(bs, BLKDBG_PWRITEV);
         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 2e9c81ff95..1dfdf92dc8 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -165,6 +165,8 @@ struct BlockDriver {
      */
     int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
+        int64_t offset, int count, BdrvRequestFlags flags);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
     int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
@@ -456,7 +458,7 @@ struct BlockDriverState {
     unsigned int request_alignment;
     /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
     unsigned int supported_write_flags;
-    /* Flags honored during write_zeroes (so far: BDRV_REQ_FUA,
+    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
      * BDRV_REQ_MAY_UNMAP) */
     unsigned int supported_zero_flags;
 
From 74021bc497c8e8a1b03d633656aa5ff7112bd721 Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:04 -0600
Subject: [PATCH 09/31] block: Switch bdrv_write_zeroes() to byte interface

Rename to bdrv_pwrite_zeroes() to let the compiler ensure we cater to
the updated semantics. Do the same for bdrv_co_write_zeroes().

Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/blkreplay.c      |  4 +++-
 block/io.c             | 34 +++++++++++++++++++++-------------
 block/parallels.c      |  4 +++-
 block/qcow2-cluster.c  |  3 +--
 block/qcow2.c          |  9 ++++-----
 block/raw_bsd.c        |  3 ++-
 include/block/block.h  | 10 +++++-----
 migration/block.c      |  5 +++--
 tests/qemu-iotests/034 |  2 +-
 tests/qemu-iotests/154 |  2 +-
 trace-events           |  2 +-
 11 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/block/blkreplay.c b/block/blkreplay.c
index 42f1813af1..1a721ad283 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -107,7 +107,9 @@ static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    int ret = bdrv_co_pwrite_zeroes(bs->file->bs,
+                                    sector_num << BDRV_SECTOR_BITS,
+                                    nb_sectors << BDRV_SECTOR_BITS, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
diff --git a/block/io.c b/block/io.c
index 91a2e23f5f..a426094a33 100644
--- a/block/io.c
+++ b/block/io.c
@@ -620,18 +620,25 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 }
 
-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-                      int nb_sectors, BdrvRequestFlags flags)
+int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                       int count, BdrvRequestFlags flags)
 {
-    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
-                      BDRV_REQ_ZERO_WRITE | flags);
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = count,
+    };
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+    return bdrv_prwv_co(bs, offset, &qiov, true,
+                        BDRV_REQ_ZERO_WRITE | flags);
 }
 
 /*
- * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
  * The operation is sped up by checking the block status and only writing
  * zeroes to the device if they currently do not return zeroes. Optional
- * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
+ * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
  * BDRV_REQ_FUA).
  *
  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
@@ -662,7 +669,8 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
             sector_num += n;
             continue;
         }
-        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+        ret = bdrv_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
+                                 n << BDRV_SECTOR_BITS, flags);
         if (ret < 0) {
             error_report("error writing zeroes at sector %" PRId64 ": %s",
                          sector_num, strerror(-ret));
@@ -1526,18 +1534,18 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
 }
 
-int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
-                                      int64_t sector_num, int nb_sectors,
-                                      BdrvRequestFlags flags)
+int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs,
+                                       int64_t offset, int count,
+                                       BdrvRequestFlags flags)
 {
-    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+    trace_bdrv_co_pwrite_zeroes(bs, offset, count, flags);
 
     if (!(bs->open_flags & BDRV_O_UNMAP)) {
         flags &= ~BDRV_REQ_MAY_UNMAP;
     }
 
-    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
-                             BDRV_REQ_ZERO_WRITE | flags);
+    return bdrv_co_pwritev(bs, offset, count, NULL,
+                           BDRV_REQ_ZERO_WRITE | flags);
 }
 
 typedef struct BdrvCoGetBlockStatusData {
diff --git a/block/parallels.c b/block/parallels.c
index b9b5c6dc3d..d6a1a616b6 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -210,7 +210,9 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
         int ret;
         space += s->prealloc_size;
         if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
-            ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0);
+            ret = bdrv_pwrite_zeroes(bs->file->bs,
+                                     s->data_end << BDRV_SECTOR_BITS,
+                                     space << BDRV_SECTOR_BITS, 0);
         } else {
             ret = bdrv_truncate(bs->file->bs,
                                 (s->data_end + space) << BDRV_SECTOR_BITS);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c73797378e..b04bfafd65 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1765,8 +1765,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                     goto fail;
                 }
 
-                ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
-                                        s->cluster_sectors, 0);
+                ret = bdrv_pwrite_zeroes(bs->file->bs, offset, s->cluster_size, 0);
                 if (ret < 0) {
                     if (!preallocated) {
                         qcow2_free_clusters(bs, offset, s->cluster_size,
diff --git a/block/qcow2.c b/block/qcow2.c
index a6ea6cb62d..cc59efc184 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2655,8 +2655,8 @@ static int make_completely_empty(BlockDriverState *bs)
 
     /* After this call, neither the in-memory nor the on-disk refcount
      * information accurately describe the actual references */
-    ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE,
-                            l1_clusters * s->cluster_sectors, 0);
+    ret = bdrv_pwrite_zeroes(bs->file->bs, s->l1_table_offset,
+                             l1_clusters * s->cluster_size, 0);
     if (ret < 0) {
         goto fail_broken_refcounts;
     }
@@ -2669,9 +2669,8 @@ static int make_completely_empty(BlockDriverState *bs)
      * overwrite parts of the existing refcount and L1 table, which is not
      * an issue because the dirty flag is set, complete data loss is in fact
      * desired and partial data loss is consequently fine as well */
-    ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE,
-                            (2 + l1_clusters) * s->cluster_size /
-                            BDRV_SECTOR_SIZE, 0);
+    ret = bdrv_pwrite_zeroes(bs->file->bs, s->cluster_size,
+                             (2 + l1_clusters) * s->cluster_size, 0);
     /* This call (even if it failed overall) may have overwritten on-disk
      * refcount structures; in that case, the in-memory refcount information
     * will probably differ from the on-disk information which makes the BDS
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 3385ed448d..d9adf90d0e 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -131,7 +131,8 @@ static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
                                             int64_t sector_num, int nb_sectors,
                                             BdrvRequestFlags flags)
 {
-    return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    return bdrv_co_pwrite_zeroes(bs->file->bs, sector_num << BDRV_SECTOR_BITS,
+                                 nb_sectors << BDRV_SECTOR_BITS, flags);
 }
 
 static int coroutine_fn raw_co_discard(BlockDriverState *bs,
diff --git a/include/block/block.h b/include/block/block.h
index 3fd5043d01..54cca28bac 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -33,7 +33,7 @@ typedef struct BlockDriverInfo {
      * True if the driver can optimize writing zeroes by unmapping
      * sectors. This is equivalent to the BLKDISCARDZEROES ioctl in Linux
      * with the difference that in qemu a discard is allowed to silently
-     * fail. Therefore we have to use bdrv_write_zeroes with the
+     * fail. Therefore we have to use bdrv_pwrite_zeroes with the
      * BDRV_REQ_MAY_UNMAP flag for an optimized zero write with unmapping.
      * After this call the driver has to guarantee that the contents read
      * back as zero. It is additionally required that the block device is
@@ -227,8 +227,8 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
               uint8_t *buf, int nb_sectors);
 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
                const uint8_t *buf, int nb_sectors);
-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-                      int nb_sectors, BdrvRequestFlags flags);
+int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                       int count, BdrvRequestFlags flags);
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count);
@@ -247,8 +247,8 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
  * function is not suitable for zeroing the entire image in a single request
  * because it may allocate memory for the entire region.
  */
-int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors, BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                       int count, BdrvRequestFlags flags);
 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
     const char *backing_file);
 int bdrv_get_backing_file_depth(BlockDriverState *bs);
diff --git a/migration/block.c b/migration/block.c
index e0628d187f..16cc1f8609 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -883,8 +883,9 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
             }
 
             if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
-                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
-                                        BDRV_REQ_MAY_UNMAP);
+                ret = bdrv_pwrite_zeroes(bs, addr << BDRV_SECTOR_BITS,
+                                         nr_sectors << BDRV_SECTOR_BITS,
+                                         BDRV_REQ_MAY_UNMAP);
             } else {
                 buf = g_malloc(BLOCK_SIZE);
                 qemu_get_buffer(f, buf, BLOCK_SIZE);
diff --git a/tests/qemu-iotests/034 b/tests/qemu-iotests/034
index c711cfce94..1b28bdae63 100755
--- a/tests/qemu-iotests/034
+++ b/tests/qemu-iotests/034
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Test bdrv_write_zeroes with backing files
+# Test bdrv_pwrite_zeroes with backing files (see also 154)
 #
 # Copyright (C) 2012 Red Hat, Inc.
 #
diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154
index 5905c55de9..7ca7219f08 100755
--- a/tests/qemu-iotests/154
+++ b/tests/qemu-iotests/154
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# qcow2 specific bdrv_write_zeroes tests with backing files (complements 034)
+# qcow2 specific bdrv_pwrite_zeroes tests with backing files (complements 034)
 #
 # Copyright (C) 2016 Red Hat, Inc.
 #
diff --git a/trace-events b/trace-events
index 5f720d0ddb..cfb18861ce 100644
--- a/trace-events
+++ b/trace-events
@@ -72,7 +72,7 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %
 bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
-bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
+bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
 bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
 # block/stream.c

From 94d047a35bf663e28f8fef137544d8ea78165add Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:05 -0600
Subject: [PATCH 10/31] iscsi: Convert to bdrv_co_pwrite_zeroes()

Another step on our continuing quest to switch to byte-based
interfaces. As this is the first byte-based iscsi interface, convert
is_request_lun_aligned() into two versions, one for sectors and one for
bytes. Also, change from outright -EINVAL failure on an unaligned
request, to instead failing with -ENOTSUP to trigger a
read-modify-write fallback, particularly since the block layer should
be honoring bs->request_alignment to avoid -EINVAL on read/write
requests.
Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/iscsi.c | 56 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 52ea9d7a70..7e78adea15 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -401,18 +401,26 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
     return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
 }
 
-static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
-                                   IscsiLun *iscsilun)
+static bool is_byte_request_lun_aligned(int64_t offset, int count,
+                                        IscsiLun *iscsilun)
 {
-    if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
-        (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
-        error_report("iSCSI misaligned request: "
-                     "iscsilun->block_size %u, sector_num %" PRIi64
-                     ", nb_sectors %d",
-                     iscsilun->block_size, sector_num, nb_sectors);
-        return 0;
+    if (offset % iscsilun->block_size || count % iscsilun->block_size) {
+        error_report("iSCSI misaligned request: "
+                     "iscsilun->block_size %u, offset %" PRIi64
+                     ", count %d",
+                     iscsilun->block_size, offset, count);
+        return false;
     }
-    return 1;
+    return true;
+}
+
+static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors,
+                                          IscsiLun *iscsilun)
+{
+    assert(nb_sectors < BDRV_REQUEST_MAX_SECTORS);
+    return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS,
+                                       nb_sectors << BDRV_SECTOR_BITS,
+                                       iscsilun);
 }
 
 static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
@@ -461,7 +469,7 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
     if (fua) {
         assert(iscsilun->dpofua);
     }
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
         return -EINVAL;
     }
 
@@ -541,7 +549,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
 
     iscsi_co_init_iscsitask(iscsilun, &iTask);
 
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
         ret = -EINVAL;
         goto out;
     }
@@ -638,7 +646,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
     uint64_t lba;
     uint32_t num_sectors;
 
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
         return -EINVAL;
     }
 
@@ -926,7 +934,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
     struct IscsiTask iTask;
     struct unmap_list list;
 
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
         return -EINVAL;
     }
 
@@ -977,8 +985,8 @@ retry:
 }
 
 static int
-coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-                                   int nb_sectors, BdrvRequestFlags flags)
+coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                    int count, BdrvRequestFlags flags)
 {
     IscsiLun *iscsilun = bs->opaque;
     struct IscsiTask iTask;
@@ -986,8 +994,8 @@
     uint32_t nb_blocks;
     bool use_16_for_ws = iscsilun->use_16_for_rw;
 
-    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        return -EINVAL;
+    if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
+        return -ENOTSUP;
     }
 
     if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1008,8 +1016,8 @@
         return -ENOTSUP;
     }
 
-    lba = sector_qemu2lun(sector_num, iscsilun);
-    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+    lba = offset / iscsilun->block_size;
+    nb_blocks = count / iscsilun->block_size;
 
     if (iscsilun->zeroblock == NULL) {
         iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
@@ -1065,9 +1073,11 @@ retry:
     }
 
     if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+        iscsi_allocationmap_clear(iscsilun, offset >> BDRV_SECTOR_BITS,
+                                  count >> BDRV_SECTOR_BITS);
     } else {
-        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+        iscsi_allocationmap_set(iscsilun, offset >> BDRV_SECTOR_BITS,
+                                count >> BDRV_SECTOR_BITS);
     }
 
     return 0;
@@ -1856,7 +1866,7 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_co_get_block_status = iscsi_co_get_block_status,
     .bdrv_co_discard      = iscsi_co_discard,
-    .bdrv_co_write_zeroes = iscsi_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
     .bdrv_co_writev_flags  = iscsi_co_writev_flags,
     .bdrv_co_flush_to_disk = iscsi_co_flush,

From 5544b59f8e672bf0a32bef376c59902530f8240f Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:06 -0600
Subject: [PATCH 11/31] qcow2: Convert to bdrv_co_pwrite_zeroes()

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake
Signed-off-by: Kevin Wolf
---
 block/qcow2.c | 37 +++++++++++++++++++------------------
 trace-events  |  4 ++--
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index cc59efc184..5e26c3d5b7 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2421,38 +2421,39 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
     return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
 }
 
-static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
     int ret;
     BDRVQcow2State *s = bs->opaque;
-    uint32_t head = sector_num % s->cluster_sectors;
-    uint32_t tail = (sector_num + nb_sectors) % s->cluster_sectors;
+    uint32_t head = offset % s->cluster_size;
+    uint32_t tail = (offset + count) % s->cluster_size;
 
-    trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
-                                       nb_sectors);
+    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
 
     if (head || tail) {
-        int64_t cl_start = sector_num - head;
+        int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
         uint64_t off;
         int nr;
 
-        assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);
+        assert(head + count <= s->cluster_size);
 
         /* check whether remainder of cluster already reads as zero */
-        if (!(is_zero_sectors(bs, cl_start, head) &&
-              is_zero_sectors(bs, sector_num + nb_sectors,
-                              -tail & (s->cluster_sectors - 1)))) {
+        if (!(is_zero_sectors(bs, cl_start,
+                              DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) &&
+              is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS,
+                              DIV_ROUND_UP(-tail & (s->cluster_size - 1),
+                                           BDRV_SECTOR_SIZE)))) {
             return -ENOTSUP;
         }
 
         qemu_co_mutex_lock(&s->lock);
         /* We can have new write after previous check */
-        sector_num = cl_start;
-        nb_sectors = nr = s->cluster_sectors;
-        ret = qcow2_get_cluster_offset(bs, cl_start << BDRV_SECTOR_BITS,
-                                       &nr, &off);
+        offset = cl_start << BDRV_SECTOR_BITS;
+        count = s->cluster_size;
+        nr = s->cluster_sectors;
+        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
         if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
             qemu_co_mutex_unlock(&s->lock);
             return -ENOTSUP;
@@ -2461,10 +2462,10 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
         qemu_co_mutex_lock(&s->lock);
     }
 
-    trace_qcow2_write_zeroes(qemu_coroutine_self(), sector_num, nb_sectors);
+    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
 
     /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
+    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
     qemu_co_mutex_unlock(&s->lock);
 
     return ret;
@@ -3371,7 +3372,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
-    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
     .bdrv_write_compressed  = qcow2_write_compressed,
diff --git a/trace-events b/trace-events
index cfb18861ce..b1ca05bdbe 100644
--- a/trace-events
+++ b/trace-events
@@ -611,8 +611,8 @@ qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
 qcow2_writev_start_part(void *co) "co %p"
 qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
-qcow2_write_zeroes_start_req(void *co, int64_t sector, int nb_sectors) "co %p sector %" PRIx64 " nb_sectors %d"
-qcow2_write_zeroes(void *co, int64_t sector, int nb_sectors) "co %p sector %" PRIx64 " nb_sectors %d"
+qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
 
 # block/qcow2-cluster.c
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int num) "co %p offset %" PRIx64 " num %d"

From 9c21a4220b021bb605bf8e51a629ff7e54105c40 Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:07 -0600
Subject: [PATCH 12/31] blkreplay: Convert to bdrv_co_pwrite_zeroes()

Another step on our continuing quest to switch to byte-based
interfaces.
Signed-off-by: Eric Blake
Reviewed-by: Kevin Wolf
Signed-off-by: Kevin Wolf
---
 block/blkreplay.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/blkreplay.c b/block/blkreplay.c
index 1a721ad283..525c2d54e0 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -103,13 +103,11 @@ static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs,
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_pwrite_zeroes(bs->file->bs,
-                                    sector_num << BDRV_SECTOR_BITS,
-                                    nb_sectors << BDRV_SECTOR_BITS, flags);
+    int ret = bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
@@ -149,7 +147,7 @@ static BlockDriver bdrv_blkreplay = {
 
     .bdrv_co_readv          = blkreplay_co_readv,
     .bdrv_co_writev         = blkreplay_co_writev,
-    .bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
     .bdrv_co_discard        = blkreplay_co_discard,
     .bdrv_co_flush          = blkreplay_co_flush,
 };

From e88a36ebad257fbbdd3ea534624d14938c2c441b Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Wed, 1 Jun 2016 15:10:08 -0600
Subject: [PATCH 13/31] gluster: Convert to bdrv_co_pwrite_zeroes()

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake
Reviewed-by: Kevin Wolf
Signed-off-by: Kevin Wolf
---
 block/gluster.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/gluster.c b/block/gluster.c
index a8aaacf645..d361d8e847 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -454,14 +454,12 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state)
 }
 
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int size, BdrvRequestFlags flags)
 {
     int ret;
     GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
-    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
     acb.size = size;
     acb.ret = 0;
@@ -769,7 +767,7 @@ static BlockDriver bdrv_gluster = {
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
     .create_opts                  = &qemu_gluster_create_opts,
 };
@@ -796,7 +794,7 @@ static BlockDriver bdrv_gluster_tcp = {
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
     .create_opts                  = &qemu_gluster_create_opts,
 };
@@ -823,7 +821,7 @@ static BlockDriver bdrv_gluster_unix = {
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
     .create_opts                  = &qemu_gluster_create_opts,
 };
@@ -850,7 +848,7 @@ static BlockDriver bdrv_gluster_rdma = {
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
.bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif .create_opts = &qemu_gluster_create_opts, }; From 49a2e48348b7b0fda04e9d767a537e6c5d230bdd Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:09 -0600 Subject: [PATCH 14/31] qed: Convert to bdrv_co_pwrite_zeroes() Another step on our continuing quest to switch to byte-based interfaces. Kill an abuse of the comma operator while at it (fortunately, the semantics were still right). Also, the test for requests not aligned to clusters should be applied always, not just when a backing file is present. Signed-off-by: Eric Blake Signed-off-by: Kevin Wolf --- block/qed.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/block/qed.c b/block/qed.c index 113b8e8cc6..12068061ac 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1418,7 +1418,7 @@ typedef struct { bool done; } QEDWriteZeroesCB; -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret) { QEDWriteZeroesCB *cb = opaque; @@ -1429,10 +1429,10 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) } } -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int count, + BdrvRequestFlags flags) { BlockAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1440,25 +1440,22 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, QEMUIOVector qiov; struct iovec iov; - /* Refuse if there are untouched backing file sectors */ - if (bs->backing) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } + /* Fall back if the request is not aligned */ + if (qed_offset_into_cluster(s, offset) || + qed_offset_into_cluster(s, count)) { + return -ENOTSUP; } /* Zero writes start without an I/O buffer. If a buffer becomes necessary * then it will be allocated during request processing. */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + iov.iov_base = NULL; + iov.iov_len = count; qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, + blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov, + count >> BDRV_SECTOR_BITS, + qed_co_pwrite_zeroes_cb, &cb, QED_AIOCB_WRITE | QED_AIOCB_ZERO); if (!blockacb) { return -EIO; @@ -1663,7 +1660,7 @@ static BlockDriver bdrv_qed = { .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, From 2ffa76c2bf97416b8d6904efb1e5f147f36127eb Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:10 -0600 Subject: [PATCH 15/31] raw-posix: Convert to bdrv_co_pwrite_zeroes() Another step on our continuing quest to switch to byte-based interfaces. 
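For raw-posix the interesting part, besides the unit switch, is the dispatch on BDRV_REQ_MAY_UNMAP: the request goes out as an explicit write-zeroes unless the caller allows unmapping and discarded blocks are known to read back as zeroes; otherwise the generic layer has to emulate. A standalone sketch of that decision logic (the flag value and helper names are illustrative, not QEMU's):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative flag value only; the real BdrvRequestFlags live in QEMU. */
    #define REQ_MAY_UNMAP 0x1

    enum zero_method { WRITE_ZEROES, DISCARD, FALLBACK };

    /* Mirrors the shape of raw_co_pwrite_zeroes(): prefer an explicit
     * write-zeroes primitive; only discard when the caller permits holes
     * and discarded blocks are guaranteed to read back as zero. */
    static enum zero_method pick_zero_method(unsigned flags, bool discard_zeroes)
    {
        if (!(flags & REQ_MAY_UNMAP)) {
            return WRITE_ZEROES;
        } else if (discard_zeroes) {
            return DISCARD;
        }
        return FALLBACK;   /* -ENOTSUP: emulate with a buffer of zeroes */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               pick_zero_method(0, true),               /* WRITE_ZEROES */
               pick_zero_method(REQ_MAY_UNMAP, true),   /* DISCARD */
               pick_zero_method(REQ_MAY_UNMAP, false)); /* FALLBACK */
        return 0;
    }

The same two-way choice, with the same fallback to -ENOTSUP, appears again in the hdev (host device) variant below.
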
Signed-off-by: Eric Blake [ kwolf: Fixed up trace_paio_submit_co() call for qiov == NULL ] Signed-off-by: Kevin Wolf --- block/raw-posix.c | 34 +++++++++++++++++----------------- trace-events | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index a4f5a1ba5f..ce1cf14f66 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1252,8 +1252,8 @@ static int aio_worker(void *arg) } static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int type) + int64_t offset, QEMUIOVector *qiov, + int count, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1262,16 +1262,16 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); + assert(qiov->size == count); } - trace_paio_submit_co(sector_num, nb_sectors, type); + trace_paio_submit_co(offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } @@ -1868,17 +1868,17 @@ static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD); } -static int coroutine_fn raw_co_write_zeroes( - BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes( + BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD); } return -ENOTSUP; @@ -1931,7 +1931,7 @@ BlockDriver bdrv_file = { .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = raw_co_get_block_status, - .bdrv_co_write_zeroes = raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, @@ -2293,8 +2293,8 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; int rc; @@ -2304,10 +2304,10 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, return rc; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } return -ENOTSUP; @@ -2379,7 +2379,7 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_opts = 
&raw_create_opts, - .bdrv_co_write_zeroes = hdev_co_write_zeroes, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, diff --git a/trace-events b/trace-events index b1ca05bdbe..421d89f476 100644 --- a/trace-events +++ b/trace-events @@ -131,7 +131,7 @@ thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" # block/raw-win32.c # block/raw-posix.c -paio_submit_co(int64_t sector_num, int nb_sectors, int type) "sector_num %"PRId64" nb_sectors %d type %d" +paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d" # ioport.c From 39ad937e162fb14f5ec940b7a9cf8a38992d7217 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:11 -0600 Subject: [PATCH 16/31] raw_bsd: Convert to bdrv_co_pwrite_zeroes() Another step on our continuing quest to switch to byte-based interfaces. Signed-off-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/raw_bsd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/raw_bsd.c b/block/raw_bsd.c index d9adf90d0e..b1d5237135 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -127,12 +127,11 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, (sector_num << BDRV_SECTOR_BITS); } -static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, + BdrvRequestFlags flags) { - return bdrv_co_pwrite_zeroes(bs->file->bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, flags); + return bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags); } static int coroutine_fn raw_co_discard(BlockDriverState *bs, @@ -253,7 +252,7 @@ BlockDriver bdrv_raw = { .bdrv_create = &raw_create, .bdrv_co_readv = &raw_co_readv, .bdrv_co_writev_flags = &raw_co_writev_flags, - .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_discard = &raw_co_discard, .bdrv_co_get_block_status = &raw_co_get_block_status, .bdrv_truncate = &raw_truncate, From a620f2ae1566a72aac289d40d72decbf95fc6de8 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:12 -0600 Subject: [PATCH 17/31] vmdk: Convert to bdrv_co_pwrite_zeroes() Another step on our continuing quest to switch to byte-based interfaces. 
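As with qed above, vmdk can only zero whole clusters cheaply; byte ranges that are not cluster-aligned have to be refused (or probed with a dry run) so that the generic layer falls back to a plain write of zeroes. A standalone sketch of such an alignment test, assuming an illustrative 64 KiB cluster size and a made-up helper name:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative cluster size; real image formats read it from the header. */
    #define CLUSTER_SIZE (64 * 1024)

    static uint64_t offset_into_cluster(uint64_t x)
    {
        return x & (CLUSTER_SIZE - 1);   /* assumes a power-of-two size */
    }

    /* Refuse anything that does not cover whole clusters, so the generic
     * block layer falls back to writing an explicit buffer of zeroes. */
    static int check_zero_request(uint64_t offset, uint64_t count)
    {
        if (offset_into_cluster(offset) || offset_into_cluster(count)) {
            return -ENOTSUP;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_zero_request(64 * 1024, 128 * 1024)); /* 0 */
        printf("%d\n", check_zero_request(62 * 1024, 4 * 1024));   /* -ENOTSUP */
        return 0;
    }
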
Signed-off-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/vmdk.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5a01e1672f..ee09423b46 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1703,15 +1703,13 @@ static int vmdk_write_compressed(BlockDriverState *bs, } } -static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int bytes, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; - uint64_t offset = sector_num * BDRV_SECTOR_SIZE; - uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE; qemu_co_mutex_lock(&s->lock); /* write zeroes could fail if sectors not aligned to cluster, test it with @@ -2402,7 +2400,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_co_preadv = vmdk_co_preadv, .bdrv_co_pwritev = vmdk_co_pwritev, .bdrv_write_compressed = vmdk_write_compressed, - .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, From c1499a5e73ae81b597869263ed8ac87bdaafeff7 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 1 Jun 2016 15:10:13 -0600 Subject: [PATCH 18/31] block: Kill bdrv_co_write_zeroes() Now that all drivers have been converted to a byte interface, we no longer need a sector interface. Signed-off-by: Eric Blake Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/io.c | 15 ++------------- include/block/block_int.h | 2 -- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/block/io.c b/block/io.c index a426094a33..9dc265b34d 100644 --- a/block/io.c +++ b/block/io.c @@ -901,7 +901,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, goto err; } - if ((drv->bdrv_co_write_zeroes || drv->bdrv_co_pwrite_zeroes) && + if (drv->bdrv_co_pwrite_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { ret = bdrv_co_do_pwrite_zeroes(bs, cluster_sector_num * BDRV_SECTOR_SIZE, @@ -1170,16 +1170,6 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, !(bs->supported_zero_flags & BDRV_REQ_FUA)) { need_flush = true; } - } else if (drv->bdrv_co_write_zeroes) { - assert(offset % BDRV_SECTOR_SIZE == 0); - assert(count % BDRV_SECTOR_SIZE == 0); - ret = drv->bdrv_co_write_zeroes(bs, offset >> BDRV_SECTOR_BITS, - num >> BDRV_SECTOR_BITS, - flags & bs->supported_zero_flags); - if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && - !(bs->supported_zero_flags & BDRV_REQ_FUA)) { - need_flush = true; - } } else { assert(!bs->supported_zero_flags); } @@ -1259,8 +1249,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && - (drv->bdrv_co_pwrite_zeroes || drv->bdrv_co_write_zeroes) && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { diff --git a/include/block/block_int.h b/include/block/block_int.h index 1dfdf92dc8..8a4963c4fe 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -163,8 +163,6 @@ struct BlockDriver { * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() * will be called instead. 
*/ - int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, From ad2964b4fff89a32c20cfad8d904a96929956c9e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 25 May 2016 17:20:06 +0200 Subject: [PATCH 19/31] migration/block: Convert load to BlockBackend This converts the loading part of block migration to use BlockBackend interfaces rather than accessing the BlockDriverState directly. Note that this takes a lazy shortcut. We should really use a separate BlockBackend that is configured for the migration rather than for the guest (e.g. writethrough caching is unnecessary) and holds its own reference to the BlockDriverState, but the impact isn't that big and we didn't have a separate migration reference before either, so it must be good enough, I guess... Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- migration/block.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/migration/block.c b/migration/block.c index 16cc1f8609..30af182f24 100644 --- a/migration/block.c +++ b/migration/block.c @@ -827,8 +827,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) int len, flags; char device_name[256]; int64_t addr; - BlockDriverState *bs, *bs_prev = NULL; - BlockBackend *blk; + BlockBackend *blk, *blk_prev = NULL;; Error *local_err = NULL; uint8_t *buf; int64_t total_sectors = 0; @@ -853,23 +852,17 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) device_name); return -EINVAL; } - bs = blk_bs(blk); - if (!bs) { - fprintf(stderr, "Block device %s has no medium\n", - device_name); - return -EINVAL; - } - if (bs != bs_prev) { - bs_prev = bs; - total_sectors = bdrv_nb_sectors(bs); + if (blk != blk_prev) { + blk_prev = blk; + total_sectors = blk_nb_sectors(blk); if (total_sectors <= 0) { error_report("Error getting length of block device %s", device_name); return -EINVAL; } - bdrv_invalidate_cache(bs, &local_err); + blk_invalidate_cache(blk, &local_err); if (local_err) { error_report_err(local_err); return -EINVAL; @@ -883,13 +876,14 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) } if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { - ret = bdrv_pwrite_zeroes(bs, addr << BDRV_SECTOR_BITS, - nr_sectors << BDRV_SECTOR_BITS, - BDRV_REQ_MAY_UNMAP); + ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE, + nr_sectors * BDRV_SECTOR_SIZE, + BDRV_REQ_MAY_UNMAP); } else { buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); - ret = bdrv_write(bs, addr, buf, nr_sectors); + ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf, + nr_sectors * BDRV_SECTOR_SIZE, 0); g_free(buf); } From ebd2f9e7db4b6cbc2f144b9cef397503b3e956b5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 27 May 2016 19:50:37 +0200 Subject: [PATCH 20/31] migration/block: Convert saving to BlockBackend This creates a new BlockBackend for copying data from an images to the migration stream on the source host. All I/O for block migration goes through BlockBackend now. 
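The saving side keeps its per-device bookkeeping: one bit per dirty chunk records whether an AIO request is in flight, packed into an array of unsigned long words. A standalone sketch of that bit arithmetic (the 1 MiB / 2048-sector chunk size is an assumption here; the real constant is derived from BLOCK_SIZE in migration/block.c):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative values: 512-byte sectors, 1 MiB dirty chunks. */
    #define SECTORS_PER_CHUNK 2048
    #define BITS_PER_LONG (sizeof(unsigned long) * 8)

    /* One bit per chunk, packed into unsigned long words, as in
     * bmds_aio_inflight()/bmds_set_aio_inflight(). */
    static void set_inflight(unsigned long *bitmap, int64_t sector, int set)
    {
        int64_t chunk = sector / SECTORS_PER_CHUNK;
        unsigned long bit = 1UL << (chunk % BITS_PER_LONG);

        if (set) {
            bitmap[chunk / BITS_PER_LONG] |= bit;
        } else {
            bitmap[chunk / BITS_PER_LONG] &= ~bit;
        }
    }

    static int is_inflight(const unsigned long *bitmap, int64_t sector)
    {
        int64_t chunk = sector / SECTORS_PER_CHUNK;
        return !!(bitmap[chunk / BITS_PER_LONG] &
                  (1UL << (chunk % BITS_PER_LONG)));
    }

    int main(void)
    {
        int64_t total_sectors = 1 << 20;   /* toy 512 MiB disk */
        size_t words = (total_sectors / SECTORS_PER_CHUNK + BITS_PER_LONG - 1)
                       / BITS_PER_LONG;
        unsigned long *bitmap = calloc(words, sizeof(*bitmap));

        set_inflight(bitmap, 4096, 1);
        printf("%d %d\n", is_inflight(bitmap, 4096), is_inflight(bitmap, 0));
        free(bitmap);
        return 0;
    }
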
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- migration/block.c | 124 +++++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 45 deletions(-) diff --git a/migration/block.c b/migration/block.c index 30af182f24..ebc10e628d 100644 --- a/migration/block.c +++ b/migration/block.c @@ -52,7 +52,8 @@ typedef struct BlkMigDevState { /* Written during setup phase. Can be read without a lock. */ - BlockDriverState *bs; + BlockBackend *blk; + char *blk_name; int shared_base; int64_t total_sectors; QSIMPLEQ_ENTRY(BlkMigDevState) entry; @@ -145,9 +146,9 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk) | flags); /* device name */ - len = strlen(bdrv_get_device_name(blk->bmds->bs)); + len = strlen(blk->bmds->blk_name); qemu_put_byte(f, len); - qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len); + qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len); /* if a block is zero we need to flush here since the network * bandwidth is now a lot higher than the storage device bandwidth. @@ -201,7 +202,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) { int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - if (sector < bdrv_nb_sectors(bmds->bs)) { + if (sector < blk_nb_sectors(bmds->blk)) { return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & (1UL << (chunk % (sizeof(unsigned long) * 8)))); } else { @@ -235,10 +236,10 @@ static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, static void alloc_aio_bitmap(BlkMigDevState *bmds) { - BlockDriverState *bs = bmds->bs; + BlockBackend *bb = bmds->blk; int64_t bitmap_size; - bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; bmds->aio_bitmap = g_malloc0(bitmap_size); @@ -268,19 +269,19 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) { int64_t total_sectors = bmds->total_sectors; int64_t cur_sector = bmds->cur_sector; - BlockDriverState *bs = bmds->bs; + BlockBackend *bb = bmds->blk; BlkMigBlock *blk; int nr_sectors; if (bmds->shared_base) { qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bs)); + aio_context_acquire(blk_get_aio_context(bb)); while (cur_sector < total_sectors && - !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH, - &nr_sectors)) { + !bdrv_is_allocated(blk_bs(bb), cur_sector, + MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) { cur_sector += nr_sectors; } - aio_context_release(bdrv_get_aio_context(bs)); + aio_context_release(blk_get_aio_context(bb)); qemu_mutex_unlock_iothread(); } @@ -323,12 +324,12 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) * without the need to acquire the AioContext. 
*/ qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov, + 0, blk_mig_read_cb, blk); bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); qemu_mutex_unlock_iothread(); bmds->cur_sector = cur_sector + nr_sectors; @@ -343,10 +344,10 @@ static int set_dirty_tracking(void) int ret; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL, NULL); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), + BLOCK_SIZE, NULL, NULL); + aio_context_release(blk_get_aio_context(bmds->blk)); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -357,9 +358,9 @@ static int set_dirty_tracking(void) fail: QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { if (bmds->dirty_bitmap) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); + aio_context_release(blk_get_aio_context(bmds->blk)); } } return ret; @@ -372,9 +373,9 @@ static void unset_dirty_tracking(void) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); + bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); + aio_context_release(blk_get_aio_context(bmds->blk)); } } @@ -384,6 +385,11 @@ static void init_blk_migration(QEMUFile *f) BlkMigDevState *bmds; int64_t sectors; BdrvNextIterator it; + int i, num_bs = 0; + struct { + BlkMigDevState *bmds; + BlockDriverState *bs; + } *bmds_bs; block_mig_state.submitted = 0; block_mig_state.read_done = 0; @@ -393,27 +399,32 @@ static void init_blk_migration(QEMUFile *f) block_mig_state.bulk_completed = 0; block_mig_state.zero_blocks = migrate_zero_blocks(); - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + num_bs++; + } + bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs)); + + for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) { if (bdrv_is_read_only(bs)) { continue; } sectors = bdrv_nb_sectors(bs); if (sectors <= 0) { - return; + goto out; } bmds = g_new0(BlkMigDevState, 1); - bmds->bs = bs; + bmds->blk = blk_new(); + bmds->blk_name = g_strdup(bdrv_get_device_name(bs)); bmds->bulk_completed = 0; bmds->total_sectors = sectors; bmds->completed_sectors = 0; bmds->shared_base = block_mig_state.shared_base; - alloc_aio_bitmap(bmds); - error_setg(&bmds->blocker, "block device is in use by migration"); - bdrv_op_block_all(bs, bmds->blocker); - bdrv_ref(bs); + + assert(i < num_bs); + bmds_bs[i].bmds = bmds; + bmds_bs[i].bs = bs; block_mig_state.total_sector_sum += sectors; @@ -426,6 +437,24 @@ static void init_blk_migration(QEMUFile *f) 
QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); } + + /* Can only insert new BDSes now because doing so while iterating block + * devices may end up in a deadlock (iterating the new BDSes, too). */ + for (i = 0; i < num_bs; i++) { + BlkMigDevState *bmds = bmds_bs[i].bmds; + BlockDriverState *bs = bmds_bs[i].bs; + + if (bmds) { + blk_insert_bs(bmds->blk, bs); + + alloc_aio_bitmap(bmds); + error_setg(&bmds->blocker, "block device is in use by migration"); + bdrv_op_block_all(bs, bmds->blocker); + } + } + +out: + g_free(bmds_bs); } /* Called with no lock taken. */ @@ -482,6 +511,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, int is_async) { BlkMigBlock *blk; + BlockDriverState *bs = blk_bs(bmds->blk); int64_t total_sectors = bmds->total_sectors; int64_t sector; int nr_sectors; @@ -491,11 +521,11 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, blk_mig_lock(); if (bmds_aio_inflight(bmds, sector)) { blk_mig_unlock(); - bdrv_drain(bmds->bs); + blk_drain(bmds->blk); } else { blk_mig_unlock(); } - if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) { + if (bdrv_get_dirty(bs, bmds->dirty_bitmap, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { nr_sectors = total_sectors - sector; @@ -513,15 +543,18 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); + blk->aiocb = blk_aio_preadv(bmds->blk, + sector * BDRV_SECTOR_SIZE, + &blk->qiov, 0, blk_mig_read_cb, + blk); blk_mig_lock(); block_mig_state.submitted++; bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); blk_mig_unlock(); } else { - ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors); + ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf, + nr_sectors * BDRV_SECTOR_SIZE); if (ret < 0) { goto error; } @@ -559,9 +592,9 @@ static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) int ret = 1; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); ret = mig_save_device_dirty(f, bmds, is_async); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); if (ret <= 0) { break; } @@ -619,9 +652,9 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); + aio_context_acquire(blk_get_aio_context(bmds->blk)); dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); + aio_context_release(blk_get_aio_context(bmds->blk)); } return dirty << BDRV_SECTOR_BITS; @@ -641,15 +674,16 @@ static void block_migration_cleanup(void *opaque) while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); - bdrv_op_unblock_all(bmds->bs, bmds->blocker); + bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker); error_free(bmds->blocker); - /* Save ctx, because bmds->bs can disappear during bdrv_unref. */ - ctx = bdrv_get_aio_context(bmds->bs); + /* Save ctx, because bmds->blk can disappear during blk_unref. 
*/ + ctx = blk_get_aio_context(bmds->blk); aio_context_acquire(ctx); - bdrv_unref(bmds->bs); + blk_unref(bmds->blk); aio_context_release(ctx); + g_free(bmds->blk_name); g_free(bmds->aio_bitmap); g_free(bmds); } From 107d433cbbc347cdd140c0f2de4574ff3675bdbe Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 30 May 2016 13:59:59 +0200 Subject: [PATCH 21/31] block: assert that bs->request_alignment is a power of 2 at least bdrv_co_preadv/pwritev expect this. Signed-off-by: Peter Lieven Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index 736432f67e..f54bc25e8a 100644 --- a/block.c +++ b/block.c @@ -1018,7 +1018,7 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file, assert(bdrv_opt_mem_align(bs) != 0); assert(bdrv_min_mem_align(bs) != 0); - assert((bs->request_alignment != 0) || bdrv_is_sg(bs)); + assert(is_power_of_2(bs->request_alignment) || bdrv_is_sg(bs)); qemu_opts_del(opts); return 0; From 6f6071745bd0366221f5a0160ed7d18d0e38b9f7 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 3 Jun 2016 10:07:02 +0800 Subject: [PATCH 22/31] raw-posix: Fetch max sectors for host block device This is sometimes a useful value we should count in. Signed-off-by: Fam Zheng Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/raw-posix.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/raw-posix.c b/block/raw-posix.c index ce1cf14f66..ce2e20f203 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -729,9 +729,33 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int hdev_get_max_transfer_length(int fd) +{ +#ifdef BLKSECTGET + int max_sectors = 0; + if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) { + return max_sectors; + } else { + return -errno; + } +#else + return -ENOSYS; +#endif +} + static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; + struct stat st; + + if (!fstat(s->fd, &st)) { + if (S_ISBLK(st.st_mode)) { + int ret = hdev_get_max_transfer_length(s->fd); + if (ret >= 0) { + bs->bl.max_transfer_length = ret; + } + } + } raw_probe_alignment(bs, s->fd, errp); bs->bl.min_mem_alignment = s->buf_align; From f3c3b87dae44ac6c82246ceb3953793951800a9a Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Thu, 2 Jun 2016 18:58:15 +0300 Subject: [PATCH 23/31] qcow2: avoid extra flushes in qcow2 The problem with excessive flushing was found by a couple of performance tests: - parallel directory tree creation (from 2 processes) - 32 cached writes + fsync at the end in a loop For the first one results improved from 2.6 loops/sec to 3.5 loops/sec. Each loop creates 10^3 directories with 10 files in each. For the second one results improved from ~600 fsync/sec to ~1100 fsync/sec. Though, it was run on SSD so it probably won't show such performance gain on rotational media. qcow2_cache_flush() calls bdrv_flush() unconditionally after writing cache entries of a particular cache. This can lead to as many as 2 additional fdatasyncs inside bdrv_flush. We can simply skip all fdatasync calls inside qcow2_co_flush_to_os as bdrv_flush for sure will do the job. These flushes are necessary to keep the right order of writes to the different caches. Though this is not necessary in the current code base as this ordering is ensured through the flush in qcow2_cache_flush_dependency(). Signed-off-by: Denis V. 
Lunev CC: Pavel Borzenkov CC: Kevin Wolf CC: Max Reitz Signed-off-by: Kevin Wolf --- block/qcow2-cache.c | 11 +++++++++-- block/qcow2.c | 4 ++-- block/qcow2.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 0fe8edae41..208a060421 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -226,7 +226,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return 0; } -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) { BDRVQcow2State *s = bs->opaque; int result = 0; @@ -242,8 +242,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) } } + return result; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + int result = qcow2_cache_write(bs, c); + if (result == 0) { - ret = bdrv_flush(bs->file->bs); + int ret = bdrv_flush(bs->file->bs); if (ret < 0) { result = ret; } diff --git a/block/qcow2.c b/block/qcow2.c index 5e26c3d5b7..6f5fb810e4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2813,14 +2813,14 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) int ret; qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_write(bs, s->l2_table_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; } if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); + ret = qcow2_cache_write(bs, s->refcount_block_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; diff --git a/block/qcow2.h b/block/qcow2.h index a063a3c1a1..7db9795d44 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -583,6 +583,7 @@ int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); From 2a9170bcd4980cdd75791f3aa0f762c5e53334bb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 6 Jun 2016 12:53:22 +0200 Subject: [PATCH 24/31] block: Fix bdrv_all_delete_snapshot() error handling The code to exit the loop after bdrv_snapshot_delete_by_id_or_name() returned failure was duplicated. The first copy of it was too early so that the AioContext lock would not be freed. This patch removes it so that only the second, correct copy remains. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi --- block/snapshot.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/snapshot.c b/block/snapshot.c index 6e6e34fcf4..da89d2b085 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -409,9 +409,6 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, if (bdrv_can_snapshot(bs) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); - if (ret < 0) { - goto fail; - } } aio_context_release(ctx); if (ret < 0) { From bf18bee547d19fde314e7b6b81f21f68b46c8a92 Mon Sep 17 00:00:00 2001 From: Colin Lord Date: Mon, 6 Jun 2016 14:15:22 -0400 Subject: [PATCH 25/31] blockdev: clean up error handling in do_open_tray Returns negative error codes and accompanying error messages in cases where the device has no tray or the tray is locked and isn't forced open. 
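For illustration, a caller can now tell the cases apart and ignore only the ones it considers harmless. A standalone sketch with hard-coded device states and a plain string standing in for QEMU's Error object (it mirrors the qmp_eject() hunk further below):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy stand-in for do_open_tray(): negative errno plus a message. */
    static int do_open_tray(const char *device, int force, const char **errp)
    {
        if (strcmp(device, "notray0") == 0) {     /* pretend: no tray */
            *errp = "Device does not have a tray";
            return -ENOSYS;
        }
        if (!force) {                             /* pretend: tray locked */
            *errp = "Device is locked, wait for the tray to open";
            return -EINPROGRESS;
        }
        return 0;
    }

    /* An eject-like caller tolerates "no tray" but reports everything else. */
    static int eject(const char *device, int force)
    {
        const char *err = NULL;
        int rc = do_open_tray(device, force, &err);

        if (rc && rc != -ENOSYS) {
            fprintf(stderr, "eject %s: %s\n", device, err);
            return rc;
        }
        return 0;   /* proceed to remove the medium */
    }

    int main(void)
    {
        eject("notray0", 0);   /* -ENOSYS silently ignored */
        eject("cd0", 0);       /* -EINPROGRESS reported */
        return 0;
    }
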
This extra information should result in better flexibility in functions that call do_open_tray. Suggested by: Markus Armbruster Signed-off-by: Colin Lord Signed-off-by: Kevin Wolf --- blockdev.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/blockdev.c b/blockdev.c index 6ccb8e1f84..7fd515a4fa 100644 --- a/blockdev.c +++ b/blockdev.c @@ -56,6 +56,8 @@ static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states = QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states); +static int do_open_tray(const char *device, bool force, Error **errp); + static const char *const if_name[IF_COUNT] = { [IF_NONE] = "none", [IF_IDE] = "ide", @@ -2274,8 +2276,6 @@ exit: block_job_txn_unref(block_job_txn); } -static int do_open_tray(const char *device, bool force, Error **errp); - void qmp_eject(const char *device, bool has_force, bool force, Error **errp) { Error *local_err = NULL; @@ -2286,16 +2286,11 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp) } rc = do_open_tray(device, force, &local_err); - if (local_err) { + if (rc && rc != -ENOSYS) { error_propagate(errp, local_err); return; } - - if (rc == EINPROGRESS) { - error_setg(errp, "Device '%s' is locked and force was not specified, " - "wait for tray to open and try again", device); - return; - } + error_free(local_err); qmp_x_blockdev_remove_medium(device, errp); } @@ -2324,11 +2319,16 @@ void qmp_block_passwd(bool has_device, const char *device, aio_context_release(aio_context); } -/** - * returns -errno on fatal error, +errno for non-fatal situations. - * errp will always be set when the return code is negative. - * May return +ENOSYS if the device has no tray, - * or +EINPROGRESS if the tray is locked and the guest has been notified. +/* + * Attempt to open the tray of @device. + * If @force, ignore its tray lock. + * Else, if the tray is locked, don't open it, but ask the guest to open it. + * On error, store an error through @errp and return -errno. + * If @device does not exist, return -ENODEV. + * If it has no removable media, return -ENOTSUP. + * If it has no tray, return -ENOSYS. + * If the guest was asked to open the tray, return -EINPROGRESS. + * Else, return 0. 
*/ static int do_open_tray(const char *device, bool force, Error **errp) { @@ -2348,8 +2348,8 @@ static int do_open_tray(const char *device, bool force, Error **errp) } if (!blk_dev_has_tray(blk)) { - /* Ignore this command on tray-less devices */ - return ENOSYS; + error_setg(errp, "Device '%s' does not have a tray", device); + return -ENOSYS; } if (blk_dev_is_tray_open(blk)) { @@ -2366,7 +2366,9 @@ static int do_open_tray(const char *device, bool force, Error **errp) } if (locked && !force) { - return EINPROGRESS; + error_setg(errp, "Device '%s' is locked and force was not specified, " + "wait for tray to open and try again", device); + return -EINPROGRESS; } return 0; @@ -2375,10 +2377,18 @@ static int do_open_tray(const char *device, bool force, Error **errp) void qmp_blockdev_open_tray(const char *device, bool has_force, bool force, Error **errp) { + Error *local_err = NULL; + int rc; + if (!has_force) { force = false; } - do_open_tray(device, force, errp); + rc = do_open_tray(device, force, &local_err); + if (rc && rc != -ENOSYS && rc != -EINPROGRESS) { + error_propagate(errp, local_err); + return; + } + error_free(local_err); } void qmp_blockdev_close_tray(const char *device, Error **errp) From 515c2f431ebe561e69c57371b2c2217792b3b820 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 7 Jun 2016 15:51:28 +0200 Subject: [PATCH 26/31] block: Don't emulate natively supported pwritev flags Drivers that implement .bdrv_co_pwritev() get the flags passed as an argument to said function, but we also unconditionally emulate the flags anyway. We shouldn't do that. Fix this by clearing all flags that the driver supports natively after it returns from .bdrv_co_pwritev(). Fixes: 4df863f3 ('block: Make supported_write_flags a per-bds property') Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- block/io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 9dc265b34d..fb99a7151c 100644 --- a/block/io.c +++ b/block/io.c @@ -816,7 +816,9 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, int ret; if (drv->bdrv_co_pwritev) { - ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; goto emulate_flags; } From b6133b8c68c75cdb6b74d23257cd330bb66f595b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 5 Aug 2014 14:17:13 +0200 Subject: [PATCH 27/31] qemu-img bench This adds a qemu-img command that allows doing some simple benchmarks for the block layer without involving guest devices and a real VM. For the start, this implements only a test of sequential reads. Signed-off-by: Kevin Wolf Reviewed-by: Denis V. 
Lunev Reviewed-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 6 ++ qemu-img.c | 190 +++++++++++++++++++++++++++++++++++++++++++++++ qemu-img.texi | 10 +++ 3 files changed, 206 insertions(+) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index e7cded6e24..f3bd546335 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -9,6 +9,12 @@ STEXI @table @option ETEXI +DEF("bench", img_bench, + "bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t cache] filename") +STEXI +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s @var{buffer_size}] [-t @var{cache}] @var{filename} +ETEXI + DEF("check", img_check, "check [-q] [--object objectdef] [--image-opts] [-f fmt] [--output=ofmt] [-r [leaks | all]] [-T src_cache] filename") STEXI diff --git a/qemu-img.c b/qemu-img.c index 4b56ad36aa..d471d10020 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3459,6 +3459,196 @@ out_no_progress: return 0; } +typedef struct BenchData { + BlockBackend *blk; + uint64_t image_size; + int bufsize; + int nrreq; + int n; + uint8_t *buf; + QEMUIOVector *qiov; + + int in_flight; + uint64_t offset; +} BenchData; + +static void bench_cb(void *opaque, int ret) +{ + BenchData *b = opaque; + BlockAIOCB *acb; + + if (ret < 0) { + error_report("Failed request: %s\n", strerror(-ret)); + exit(EXIT_FAILURE); + } + if (b->in_flight > 0) { + b->n--; + b->in_flight--; + } + + while (b->n > b->in_flight && b->in_flight < b->nrreq) { + acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, + bench_cb, b); + if (!acb) { + error_report("Failed to issue request"); + exit(EXIT_FAILURE); + } + b->in_flight++; + b->offset += b->bufsize; + b->offset %= b->image_size; + } +} + +static int img_bench(int argc, char **argv) +{ + int c, ret = 0; + const char *fmt = NULL, *filename; + bool quiet = false; + bool image_opts = false; + int count = 75000; + int depth = 64; + size_t bufsize = 4096; + int64_t image_size; + BlockBackend *blk = NULL; + BenchData data = {}; + int flags = 0; + bool writethrough; + struct timeval t1, t2; + int i; + + for (;;) { + static const struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, + {0, 0, 0, 0} + }; + c = getopt_long(argc, argv, "hc:d:f:nqs:t:", long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'h': + case '?': + help(); + break; + case 'c': + { + char *end; + errno = 0; + count = strtoul(optarg, &end, 0); + if (errno || *end || count > INT_MAX) { + error_report("Invalid request count specified"); + return 1; + } + break; + } + case 'd': + { + char *end; + errno = 0; + depth = strtoul(optarg, &end, 0); + if (errno || *end || depth > INT_MAX) { + error_report("Invalid queue depth specified"); + return 1; + } + break; + } + case 'f': + fmt = optarg; + break; + case 'n': + flags |= BDRV_O_NATIVE_AIO; + break; + case 'q': + quiet = true; + break; + case 's': + { + int64_t sval; + char *end; + + sval = qemu_strtosz_suffix(optarg, &end, QEMU_STRTOSZ_DEFSUFFIX_B); + if (sval < 0 || sval > INT_MAX || *end) { + error_report("Invalid buffer size specified"); + return 1; + } + + bufsize = sval; + break; + } + case 't': + ret = bdrv_parse_cache_mode(optarg, &flags, &writethrough); + if (ret < 0) { + error_report("Invalid cache mode"); + ret = -1; + goto out; + } + break; + case OPTION_IMAGE_OPTS: + image_opts = true; + break; + } + } + + if (optind != argc - 1) { + error_exit("Expecting one image file name"); + } + filename = argv[argc - 1]; + + blk = img_open(image_opts, filename, fmt, flags, writethrough, 
quiet); + if (!blk) { + ret = -1; + goto out; + } + + image_size = blk_getlength(blk); + if (image_size < 0) { + ret = image_size; + goto out; + } + + data = (BenchData) { + .blk = blk, + .image_size = image_size, + .bufsize = bufsize, + .nrreq = depth, + .n = count, + }; + printf("Sending %d requests, %d bytes each, %d in parallel\n", + data.n, data.bufsize, data.nrreq); + + data.buf = blk_blockalign(blk, data.nrreq * data.bufsize); + data.qiov = g_new(QEMUIOVector, data.nrreq); + for (i = 0; i < data.nrreq; i++) { + qemu_iovec_init(&data.qiov[i], 1); + qemu_iovec_add(&data.qiov[i], + data.buf + i * data.bufsize, data.bufsize); + } + + gettimeofday(&t1, NULL); + bench_cb(&data, 0); + + while (data.n > 0) { + main_loop_wait(false); + } + gettimeofday(&t2, NULL); + + printf("Run completed in %3.3f seconds.\n", + (t2.tv_sec - t1.tv_sec) + + ((double)(t2.tv_usec - t1.tv_usec) / 1000000)); + +out: + qemu_vfree(data.buf); + blk_unref(blk); + + if (ret) { + return 1; + } + return 0; +} + + static const img_cmd_t img_cmds[] = { #define DEF(option, callback, arg_string) \ { option, callback }, diff --git a/qemu-img.texi b/qemu-img.texi index afaebdd408..b6b28e3542 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -131,6 +131,16 @@ Skip the creation of the target volume Command description: @table @option +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s @var{buffer_size}] [-t @var{cache}] @var{filename} + +Run a simple sequential read benchmark on the specified image. A total number +of @var{count} I/O requests is performed, each @var{buffer_size} bytes in size, +and with @var{depth} requests in parallel. + +If @code{-n} is specified, the native AIO backend is used if possible. On +Linux, this option only works if @code{-t none} or @code{-t directsync} is +specified as well. + @item check [-f @var{fmt}] [--output=@var{ofmt}] [-r [leaks | all]] [-T @var{src_cache}] @var{filename} Perform a consistency check on the disk image @var{filename}. The command can From b6495fa8493bb56e51506e39a5e575a4404c1cec Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 10 Jul 2015 18:09:18 +0200 Subject: [PATCH 28/31] qemu-img bench: Sequential writes This extends qemu-img bench with an option that makes it use sequential writes instead of reads for the test run. Signed-off-by: Kevin Wolf Reviewed-by: Denis V. 
Lunev Reviewed-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 38 +++++++++++++++++++++++++++++++++----- qemu-img.texi | 13 +++++++++---- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index f3bd546335..baca85ee12 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -10,9 +10,9 @@ STEXI ETEXI DEF("bench", img_bench, - "bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t cache] filename") + "bench [-c count] [-d depth] [-f fmt] [-n] [--pattern=pattern] [-q] [-s buffer_size] [-t cache] [-w] filename") STEXI -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s @var{buffer_size}] [-t @var{cache}] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} ETEXI DEF("check", img_check, diff --git a/qemu-img.c b/qemu-img.c index d471d10020..85d13532e8 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -53,6 +53,7 @@ enum { OPTION_BACKING_CHAIN = 257, OPTION_OBJECT = 258, OPTION_IMAGE_OPTS = 259, + OPTION_PATTERN = 260, }; typedef enum OutputFormat { @@ -3462,6 +3463,7 @@ out_no_progress: typedef struct BenchData { BlockBackend *blk; uint64_t image_size; + bool write; int bufsize; int nrreq; int n; @@ -3487,8 +3489,13 @@ static void bench_cb(void *opaque, int ret) } while (b->n > b->in_flight && b->in_flight < b->nrreq) { - acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, - bench_cb, b); + if (b->write) { + acb = blk_aio_pwritev(b->blk, b->offset, b->qiov, 0, + bench_cb, b); + } else { + acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, + bench_cb, b); + } if (!acb) { error_report("Failed to issue request"); exit(EXIT_FAILURE); @@ -3505,9 +3512,11 @@ static int img_bench(int argc, char **argv) const char *fmt = NULL, *filename; bool quiet = false; bool image_opts = false; + bool is_write = false; int count = 75000; int depth = 64; size_t bufsize = 4096; + int pattern = 0; int64_t image_size; BlockBackend *blk = NULL; BenchData data = {}; @@ -3520,9 +3529,10 @@ static int img_bench(int argc, char **argv) static const struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, + {"pattern", required_argument, 0, OPTION_PATTERN}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "hc:d:f:nqs:t:", long_options, NULL); + c = getopt_long(argc, argv, "hc:d:f:nqs:t:w", long_options, NULL); if (c == -1) { break; } @@ -3585,6 +3595,21 @@ static int img_bench(int argc, char **argv) goto out; } break; + case 'w': + flags |= BDRV_O_RDWR; + is_write = true; + break; + case OPTION_PATTERN: + { + char *end; + errno = 0; + pattern = strtoul(optarg, &end, 0); + if (errno || *end || pattern > 0xff) { + error_report("Invalid pattern byte specified"); + return 1; + } + break; + } case OPTION_IMAGE_OPTS: image_opts = true; break; @@ -3614,11 +3639,14 @@ static int img_bench(int argc, char **argv) .bufsize = bufsize, .nrreq = depth, .n = count, + .write = is_write, }; - printf("Sending %d requests, %d bytes each, %d in parallel\n", - data.n, data.bufsize, data.nrreq); + printf("Sending %d %s requests, %d bytes each, %d in parallel\n", + data.n, data.write ? 
"write" : "read", data.bufsize, data.nrreq); data.buf = blk_blockalign(blk, data.nrreq * data.bufsize); + memset(data.buf, pattern, data.nrreq * data.bufsize); + data.qiov = g_new(QEMUIOVector, data.nrreq); for (i = 0; i < data.nrreq; i++) { qemu_iovec_init(&data.qiov[i], 1); diff --git a/qemu-img.texi b/qemu-img.texi index b6b28e3542..c477fbf1c3 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -131,16 +131,21 @@ Skip the creation of the target volume Command description: @table @option -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s @var{buffer_size}] [-t @var{cache}] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} -Run a simple sequential read benchmark on the specified image. A total number -of @var{count} I/O requests is performed, each @var{buffer_size} bytes in size, -and with @var{depth} requests in parallel. +Run a simple sequential I/O benchmark on the specified image. If @code{-w} is +specified, a write test is performed, otherwise a read test is performed. + +A total number of @var{count} I/O requests is performed, each @var{buffer_size} +bytes in size, and with @var{depth} requests in parallel. If @code{-n} is specified, the native AIO backend is used if possible. On Linux, this option only works if @code{-t none} or @code{-t directsync} is specified as well. +For write tests, by default a buffer filled with zeros is written. This can be +overridden with a pattern byte specified by @var{pattern}. + @item check [-f @var{fmt}] [--output=@var{ofmt}] [-r [leaks | all]] [-T @var{src_cache}] @var{filename} Perform a consistency check on the disk image @var{filename}. The command can From d3199a31c7bc72f6bcecbb3ebcc16940a1721e10 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 10 Jul 2015 18:09:18 +0200 Subject: [PATCH 29/31] qemu-img bench: Make start offset configurable This patch adds an option the specify the offset of the first request made by qemu-img bench. This allows to benchmark misaligned requests. Signed-off-by: Kevin Wolf Reviewed-by: Denis V. 
Lunev Reviewed-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 23 ++++++++++++++++++++--- qemu-img.texi | 5 +++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index baca85ee12..117d0f9264 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -10,9 +10,9 @@ STEXI ETEXI DEF("bench", img_bench, - "bench [-c count] [-d depth] [-f fmt] [-n] [--pattern=pattern] [-q] [-s buffer_size] [-t cache] [-w] filename") + "bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-t cache] [-w] filename") STEXI -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} ETEXI DEF("check", img_check, diff --git a/qemu-img.c b/qemu-img.c index 85d13532e8..480ef8dba3 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3515,6 +3515,7 @@ static int img_bench(int argc, char **argv) bool is_write = false; int count = 75000; int depth = 64; + int64_t offset = 0; size_t bufsize = 4096; int pattern = 0; int64_t image_size; @@ -3532,7 +3533,7 @@ static int img_bench(int argc, char **argv) {"pattern", required_argument, 0, OPTION_PATTERN}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "hc:d:f:nqs:t:w", long_options, NULL); + c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL); if (c == -1) { break; } @@ -3570,6 +3571,19 @@ static int img_bench(int argc, char **argv) case 'n': flags |= BDRV_O_NATIVE_AIO; break; + case 'o': + { + char *end; + errno = 0; + offset = qemu_strtosz_suffix(optarg, &end, + QEMU_STRTOSZ_DEFSUFFIX_B); + if (offset < 0|| *end) { + error_report("Invalid offset specified"); + return 1; + } + break; + } + break; case 'q': quiet = true; break; @@ -3639,10 +3653,13 @@ static int img_bench(int argc, char **argv) .bufsize = bufsize, .nrreq = depth, .n = count, + .offset = offset, .write = is_write, }; - printf("Sending %d %s requests, %d bytes each, %d in parallel\n", - data.n, data.write ? "write" : "read", data.bufsize, data.nrreq); + printf("Sending %d %s requests, %d bytes each, %d in parallel " + "(starting at offset %" PRId64 ")\n", + data.n, data.write ? "write" : "read", data.bufsize, data.nrreq, + data.offset); data.buf = blk_blockalign(blk, data.nrreq * data.bufsize); memset(data.buf, pattern, data.nrreq * data.bufsize); diff --git a/qemu-img.texi b/qemu-img.texi index c477fbf1c3..9bffad226b 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -131,13 +131,14 @@ Skip the creation of the target volume Command description: @table @option -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename} Run a simple sequential I/O benchmark on the specified image. If @code{-w} is specified, a write test is performed, otherwise a read test is performed. A total number of @var{count} I/O requests is performed, each @var{buffer_size} -bytes in size, and with @var{depth} requests in parallel. +bytes in size, and with @var{depth} requests in parallel. The first request +starts at the position given by @var{offset}. 
If @code{-n} is specified, the native AIO backend is used if possible. On
Linux, this option only works if @code{-t none} or @code{-t directsync} is

From 83de9be0dc59439ec6f97f58c8e9015b4fdc3010 Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Mon, 13 Jul 2015 13:13:17 +0200
Subject: [PATCH 30/31] qemu-img bench: Implement -S (step size)

With this new option, qemu-img bench can be told to advance the current
offset after each request by a different value than the buffer size.

This is useful for controlling the conditions for cluster allocation in
image formats (e.g. qcow2 cluster allocation with COW in front of the
request, or COW areas that aren't overwritten immediately).

Signed-off-by: Kevin Wolf
Reviewed-by: Denis V. Lunev
Reviewed-by: Stefan Hajnoczi
---
 qemu-img-cmds.hx |  4 ++--
 qemu-img.c       | 25 +++++++++++++++++++++----
 qemu-img.texi    |  6 ++++--
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 117d0f9264..05a2991787 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI

 DEF("bench", img_bench,
-    "bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-t cache] [-w] filename")
+    "bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}
 ETEXI

 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index 480ef8dba3..c5e2638a5a 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3465,6 +3465,7 @@ typedef struct BenchData {
     uint64_t image_size;
     bool write;
     int bufsize;
+    int step;
     int nrreq;
     int n;
     uint8_t *buf;
@@ -3501,7 +3502,7 @@ static void bench_cb(void *opaque, int ret)
             exit(EXIT_FAILURE);
         }
         b->in_flight++;
-        b->offset += b->bufsize;
+        b->offset += b->step;
         b->offset %= b->image_size;
     }
 }
@@ -3518,6 +3519,7 @@ static int img_bench(int argc, char **argv)
     int64_t offset = 0;
     size_t bufsize = 4096;
     int pattern = 0;
+    size_t step = 0;
     int64_t image_size;
     BlockBackend *blk = NULL;
     BenchData data = {};
@@ -3533,7 +3535,7 @@
             {"pattern", required_argument, 0, OPTION_PATTERN},
             {0, 0, 0, 0}
         };
-        c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL);
+        c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL);
         if (c == -1) {
             break;
         }
@@ -3601,6 +3603,20 @@
             bufsize = sval;
             break;
         }
+        case 'S':
+        {
+            int64_t sval;
+            char *end;
+
+            sval = qemu_strtosz_suffix(optarg, &end, QEMU_STRTOSZ_DEFSUFFIX_B);
+            if (sval < 0 || sval > INT_MAX || *end) {
+                error_report("Invalid step size specified");
+                return 1;
+            }
+
+            step = sval;
+            break;
+        }
         case 't':
             ret = bdrv_parse_cache_mode(optarg, &flags, &writethrough);
             if (ret < 0) {
@@ -3651,15 +3667,16 @@
         .blk = blk,
         .image_size = image_size,
         .bufsize = bufsize,
+        .step = step ?: bufsize,
         .nrreq = depth,
         .n = count,
         .offset = offset,
         .write = is_write,
     };
     printf("Sending %d %s requests, %d bytes each, %d in parallel "
-           "(starting at offset %" PRId64 ")\n",
+           "(starting at offset %" PRId64 ", step size %d)\n",
            data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
-           data.offset);
+           data.offset, data.step);

     data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
     memset(data.buf, pattern, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index 9bffad226b..ccc0b519ba 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,14 +131,16 @@ Skip the creation of the target volume
 Command description:

 @table @option
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}

 Run a simple sequential I/O benchmark on the specified image. If @code{-w} is
 specified, a write test is performed, otherwise a read test is performed.

 A total number of @var{count} I/O requests is performed, each @var{buffer_size}
 bytes in size, and with @var{depth} requests in parallel. The first request
-starts at the position given by @var{offset}.
+starts at the position given by @var{offset}, each following request increases
+the current position by @var{step_size}. If @var{step_size} is not given,
+@var{buffer_size} is used for its value.

 If @code{-n} is specified, the native AIO backend is used if possible. On
 Linux, this option only works if @code{-t none} or @code{-t directsync} is

From 55d539c8f75abab70301a43d8c94b976f6ddc358 Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Fri, 3 Jun 2016 13:59:41 +0200
Subject: [PATCH 31/31] qemu-img bench: Add --flush-interval

This option allows the image to be flushed periodically during write
tests.

Signed-off-by: Kevin Wolf
Reviewed-by: Denis V. Lunev
Reviewed-by: Stefan Hajnoczi
---
 qemu-img-cmds.hx |  4 +-
 qemu-img.c       | 95 +++++++++++++++++++++++++++++++++++++++++++-----
 qemu-img.texi    |  8 +++-
 3 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 05a2991787..7e95b2da79 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI

 DEF("bench", img_bench,
-    "bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename")
+    "bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}
 ETEXI

 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index c5e2638a5a..04cddabdf4 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -54,6 +54,8 @@ enum {
     OPTION_OBJECT = 258,
     OPTION_IMAGE_OPTS = 259,
     OPTION_PATTERN = 260,
+    OPTION_FLUSH_INTERVAL = 261,
+    OPTION_NO_DRAIN = 262,
 };

 typedef enum OutputFormat {
@@ -3468,13 +3470,24 @@ typedef struct BenchData {
     int step;
     int nrreq;
     int n;
+    int flush_interval;
+    bool drain_on_flush;
     uint8_t *buf;
     QEMUIOVector *qiov;

     int in_flight;
+    bool in_flush;
     uint64_t offset;
 } BenchData;

+static void bench_undrained_flush_cb(void *opaque, int ret)
+{
+    if (ret < 0) {
+        error_report("Failed flush request: %s\n", strerror(-ret));
+        exit(EXIT_FAILURE);
+    }
+}
+
 static void bench_cb(void *opaque, int ret)
 {
     BenchData *b = opaque;
@@ -3484,9 +3497,39 @@ static void bench_cb(void *opaque, int ret)
         error_report("Failed request: %s\n", strerror(-ret));
         exit(EXIT_FAILURE);
     }
-    if (b->in_flight > 0) {
+
+    if (b->in_flush) {
+        /* Just finished a flush with drained queue: Start next requests */
+        assert(b->in_flight == 0);
+        b->in_flush = false;
+    } else if (b->in_flight > 0) {
+        int remaining = b->n - b->in_flight;
+
         b->n--;
         b->in_flight--;
+
+        /* Time for flush? Drain queue if requested, then flush */
+        if (b->flush_interval && remaining % b->flush_interval == 0) {
+            if (!b->in_flight || !b->drain_on_flush) {
+                BlockCompletionFunc *cb;
+
+                if (b->drain_on_flush) {
+                    b->in_flush = true;
+                    cb = bench_cb;
+                } else {
+                    cb = bench_undrained_flush_cb;
+                }
+
+                acb = blk_aio_flush(b->blk, cb, b);
+                if (!acb) {
+                    error_report("Failed to issue flush request");
+                    exit(EXIT_FAILURE);
+                }
+            }
+            if (b->drain_on_flush) {
+                return;
+            }
+        }
     }

     while (b->n > b->in_flight && b->in_flight < b->nrreq) {
@@ -3520,6 +3563,8 @@
     size_t bufsize = 4096;
     int pattern = 0;
     size_t step = 0;
+    int flush_interval = 0;
+    bool drain_on_flush = true;
     int64_t image_size;
     BlockBackend *blk = NULL;
     BenchData data = {};
@@ -3531,8 +3576,10 @@
     for (;;) {
         static const struct option long_options[] = {
             {"help", no_argument, 0, 'h'},
+            {"flush-interval", required_argument, 0, OPTION_FLUSH_INTERVAL},
             {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
             {"pattern", required_argument, 0, OPTION_PATTERN},
+            {"no-drain", no_argument, 0, OPTION_NO_DRAIN},
             {0, 0, 0, 0}
         };
         c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL);
@@ -3640,6 +3687,20 @@
             }
             break;
         }
+        case OPTION_FLUSH_INTERVAL:
+        {
+            char *end;
+            errno = 0;
+            flush_interval = strtoul(optarg, &end, 0);
+            if (errno || *end || flush_interval > INT_MAX) {
+                error_report("Invalid flush interval specified");
+                return 1;
+            }
+            break;
+        }
+        case OPTION_NO_DRAIN:
+            drain_on_flush = false;
+            break;
         case OPTION_IMAGE_OPTS:
             image_opts = true;
             break;
@@ -3651,6 +3712,17 @@
     }
     filename = argv[argc - 1];

+    if (!is_write && flush_interval) {
+        error_report("--flush-interval is only available in write tests");
+        ret = -1;
+        goto out;
+    }
+    if (flush_interval && flush_interval < depth) {
+        error_report("Flush interval can't be smaller than depth");
+        ret = -1;
+        goto out;
+    }
+
     blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet);
     if (!blk) {
         ret = -1;
@@ -3664,19 +3736,24 @@
     }

     data = (BenchData) {
-        .blk = blk,
-        .image_size = image_size,
-        .bufsize = bufsize,
-        .step = step ?: bufsize,
-        .nrreq = depth,
-        .n = count,
-        .offset = offset,
-        .write = is_write,
+        .blk            = blk,
+        .image_size     = image_size,
+        .bufsize        = bufsize,
+        .step           = step ?: bufsize,
+        .nrreq          = depth,
+        .n              = count,
+        .offset         = offset,
+        .write          = is_write,
+        .flush_interval = flush_interval,
+        .drain_on_flush = drain_on_flush,
     };
     printf("Sending %d %s requests, %d bytes each, %d in parallel "
            "(starting at offset %" PRId64 ", step size %d)\n",
            data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
            data.offset, data.step);
+    if (flush_interval) {
+        printf("Sending flush every %d requests\n", flush_interval);
+    }

     data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
     memset(data.buf, pattern, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index ccc0b519ba..cbe50e9b88 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,7 +131,7 @@ Skip the creation of the target volume
 Command description:

 @table @option
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] @var{filename}

 Run a simple sequential I/O benchmark on the specified image. If @code{-w} is
 specified, a write test is performed, otherwise a read test is performed.
@@ -142,6 +142,12 @@ starts at the position given by @var{offset}, each following request increases
 the current position by @var{step_size}. If @var{step_size} is not given,
 @var{buffer_size} is used for its value.

+If @var{flush_interval} is specified for a write test, the request queue is
+drained and a flush is issued before new writes are made whenever the number of
+remaining requests is a multiple of @var{flush_interval}. If additionally
+@code{--no-drain} is specified, a flush is issued without draining the request
+queue first.
+
 If @code{-n} is specified, the native AIO backend is used if possible. On
 Linux, this option only works if @code{-t none} or @code{-t directsync} is
 specified as well.