From 1bce6b4ce3ba03015bdf16d40c1bbe33d8b6031b Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:11 -0500
Subject: [PATCH 01/22] qemu-io: Improve alignment checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several copy-and-pasted alignment checks exist in qemu-io, which
could use some minor improvements:

- Manual comparison against 0x1ff is not as clean as using our
alignment macros (QEMU_IS_ALIGNED) from osdep.h.

- The error messages aren't quite grammatically correct.

Suggested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Suggested-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170429191419.30051-2-eblake@redhat.com
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-io-cmds.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 21af9e65b2..6a0024b484 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -740,13 +740,13 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
     }
 
     if (bflag) {
-        if (offset & 0x1ff) {
-            printf("offset %" PRId64 " is not sector aligned\n",
+        if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
+            printf("%" PRId64 " is not a sector-aligned value for 'offset'\n",
                    offset);
             return 0;
         }
-        if (count & 0x1ff) {
-            printf("count %"PRId64" is not sector aligned\n",
+        if (!QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)) {
+            printf("%"PRId64" is not a sector-aligned value for 'count'\n",
                    count);
             return 0;
         }
@@ -1050,14 +1050,14 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
     }
 
     if (bflag || cflag) {
-        if (offset & 0x1ff) {
-            printf("offset %" PRId64 " is not sector aligned\n",
+        if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
+            printf("%" PRId64 " is not a sector-aligned value for 'offset'\n",
                    offset);
             return 0;
         }
 
-        if (count & 0x1ff) {
-            printf("count %"PRId64" is not sector aligned\n",
+        if (!QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)) {
+            printf("%"PRId64" is not a sector-aligned value for 'count'\n",
                    count);
             return 0;
         }
@@ -1769,8 +1769,8 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
     if (offset < 0) {
         print_cvtnum_err(offset, argv[1]);
         return 0;
-    } else if (offset & 0x1ff) {
-        printf("offset %" PRId64 " is not sector aligned\n",
+    } else if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
+        printf("%" PRId64 " is not a sector-aligned value for 'offset'\n",
                offset);
         return 0;
     }

From 4401fdc77cb6d79e53ce2fc2193444cad8b95749 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:12 -0500
Subject: [PATCH 02/22] qemu-io: Switch 'alloc' command to byte-based length

For the 'alloc' command, accepting an offset in bytes but a length
in sectors, and reporting output in sectors, is confusing.  Do
everything in bytes, and adjust the expected output accordingly.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170429191419.30051-3-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-io-cmds.c                    | 30 ++++++++++++++++++------------
 tests/qemu-iotests/019.out        |  8 ++++----
 tests/qemu-iotests/common.pattern |  2 +-
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 6a0024b484..1e0ebb4575 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1760,7 +1760,7 @@ out:
 static int alloc_f(BlockBackend *blk, int argc, char **argv)
 {
     BlockDriverState *bs = blk_bs(blk);
-    int64_t offset, sector_num, nb_sectors, remaining;
+    int64_t offset, sector_num, nb_sectors, remaining, count;
     char s1[64];
     int num, ret;
     int64_t sum_alloc;
@@ -1776,18 +1776,24 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
     }
 
     if (argc == 3) {
-        nb_sectors = cvtnum(argv[2]);
-        if (nb_sectors < 0) {
-            print_cvtnum_err(nb_sectors, argv[2]);
+        count = cvtnum(argv[2]);
+        if (count < 0) {
+            print_cvtnum_err(count, argv[2]);
             return 0;
-        } else if (nb_sectors > INT_MAX) {
-            printf("length argument cannot exceed %d, given %s\n",
-                   INT_MAX, argv[2]);
+        } else if (count > INT_MAX * BDRV_SECTOR_SIZE) {
+            printf("length argument cannot exceed %llu, given %s\n",
+                   INT_MAX * BDRV_SECTOR_SIZE, argv[2]);
             return 0;
         }
     } else {
-        nb_sectors = 1;
+        count = BDRV_SECTOR_SIZE;
     }
+    if (!QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)) {
+        printf("%" PRId64 " is not a sector-aligned value for 'count'\n",
+               count);
+        return 0;
+    }
+    nb_sectors = count >> BDRV_SECTOR_BITS;
 
     remaining = nb_sectors;
     sum_alloc = 0;
@@ -1811,8 +1817,8 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
 
     cvtstr(offset, s1, sizeof(s1));
 
-    printf("%"PRId64"/%"PRId64" sectors allocated at offset %s\n",
-           sum_alloc, nb_sectors, s1);
+    printf("%"PRId64"/%"PRId64" bytes allocated at offset %s\n",
+           sum_alloc << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS, s1);
     return 0;
 }
 
@@ -1822,8 +1828,8 @@ static const cmdinfo_t alloc_cmd = {
     .argmin     = 1,
     .argmax     = 2,
     .cfunc      = alloc_f,
-    .args       = "off [sectors]",
-    .oneline    = "checks if a sector is present in the file",
+    .args       = "offset [count]",
+    .oneline    = "checks if offset is allocated in the file",
 };
 
 
diff --git a/tests/qemu-iotests/019.out b/tests/qemu-iotests/019.out
index 0124264975..17a7c036b9 100644
--- a/tests/qemu-iotests/019.out
+++ b/tests/qemu-iotests/019.out
@@ -542,8 +542,8 @@ Testing conversion with -B TEST_DIR/t.IMGFMT.base
 
 Checking if backing clusters are allocated when they shouldn't
 
-0/128 sectors allocated at offset 1 MiB
-0/128 sectors allocated at offset 4.001 GiB
+0/65536 bytes allocated at offset 1 MiB
+0/65536 bytes allocated at offset 4.001 GiB
 Reading
 
 === IO: pattern 42
@@ -1086,8 +1086,8 @@ Testing conversion with -o backing_file=TEST_DIR/t.IMGFMT.base
 
 Checking if backing clusters are allocated when they shouldn't
 
-0/128 sectors allocated at offset 1 MiB
-0/128 sectors allocated at offset 4.001 GiB
+0/65536 bytes allocated at offset 1 MiB
+0/65536 bytes allocated at offset 4.001 GiB
 Reading
 
 === IO: pattern 42
diff --git a/tests/qemu-iotests/common.pattern b/tests/qemu-iotests/common.pattern
index ddfbca1b76..34f4a8dc9b 100644
--- a/tests/qemu-iotests/common.pattern
+++ b/tests/qemu-iotests/common.pattern
@@ -18,7 +18,7 @@
 
 function do_is_allocated() {
     local start=$1
-    local size=$(( $2 / 512))
+    local size=$2
     local step=$3
     local count=$4
 

From 6f3c90af3c50d4f839849c8ba9b6ba4e9a548c28 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:13 -0500
Subject: [PATCH 03/22] qemu-io: Switch 'map' output to byte-based reporting

Mixing byte offset and sector allocation counts is a bit
confusing.  Also, reporting n/m sectors, where m decreases
according to the remaining size of the file, isn't really
adding any useful information; and reporting an offset at
both the front and end of the line, with large amounts of
whitespace, is pointless.  Update the output to use byte
counts and shorter lines, then adjust the affected tests
(./check -qcow2 102, ./check -vpc 146).

Note that 'qemu-io map' is MUCH weaker than 'qemu-img map';
the former only shows which regions of the active layer are
allocated, without regards to where the allocation comes from
or whether the allocated portion is known to read as zero
(because it is using the weaker bdrv_is_allocated()); while the
latter (especially in --output=json mode) reports more details
from bdrv_get_block_status().

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170429191419.30051-4-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-io-cmds.c             | 11 ++++++-----
 tests/qemu-iotests/102.out |  4 ++--
 tests/qemu-iotests/146.out | 30 +++++++++++++++---------------
 3 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 1e0ebb4575..4b2278f040 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1868,7 +1868,7 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
 {
     int64_t offset;
     int64_t nb_sectors, total_sectors;
-    char s1[64];
+    char s1[64], s2[64];
     int64_t num;
     int ret;
     const char *retstr;
@@ -1894,10 +1894,11 @@ static int map_f(BlockBackend *blk, int argc, char **argv)
         }
 
         retstr = ret ? "    allocated" : "not allocated";
-        cvtstr(offset << 9ULL, s1, sizeof(s1));
-        printf("[% 24" PRId64 "] % 8" PRId64 "/% 8" PRId64 " sectors %s "
-               "at offset %s (%d)\n",
-               offset << 9ULL, num, nb_sectors, retstr, s1, ret);
+        cvtstr(num << BDRV_SECTOR_BITS, s1, sizeof(s1));
+        cvtstr(offset << BDRV_SECTOR_BITS, s2, sizeof(s2));
+        printf("%s (0x%" PRIx64 ") bytes %s at offset %s (0x%" PRIx64 ")\n",
+               s1, num << BDRV_SECTOR_BITS, retstr,
+               s2, offset << BDRV_SECTOR_BITS);
 
         offset += num;
         nb_sectors -= num;
diff --git a/tests/qemu-iotests/102.out b/tests/qemu-iotests/102.out
index eecde16ad5..ccf172abd9 100644
--- a/tests/qemu-iotests/102.out
+++ b/tests/qemu-iotests/102.out
@@ -6,7 +6,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 Image resized.
-[                       0]      128/     128 sectors     allocated at offset 0 bytes (1)
+64 KiB (0x10000) bytes     allocated at offset 0 bytes (0x0)
 Offset          Length          Mapped to       File
 
 === Testing map on an image file truncated outside of qemu ===
@@ -17,5 +17,5 @@ wrote 65536/65536 bytes at offset 0
 Image resized.
 QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) qemu-io drv0 map
-[                       0]      128/     128 sectors     allocated at offset 0 bytes (1)
+64 KiB (0x10000) bytes     allocated at offset 0 bytes (0x0)
 *** done
diff --git a/tests/qemu-iotests/146.out b/tests/qemu-iotests/146.out
index 4f334d86bc..db6b296b9e 100644
--- a/tests/qemu-iotests/146.out
+++ b/tests/qemu-iotests/146.out
@@ -2,39 +2,39 @@ QA output created by 146
 
 === Testing VPC Autodetect ===
 
-[                       0]  266334240/ 266334240 sectors not allocated at offset 0 bytes (0)
+126.998 GiB (0x1fbfe04000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing VPC with current_size force ===
 
-[                       0]  266338304/ 266338304 sectors not allocated at offset 0 bytes (0)
+127 GiB (0x1fc0000000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing VPC with chs force ===
 
-[                       0]  266334240/ 266334240 sectors not allocated at offset 0 bytes (0)
+126.998 GiB (0x1fbfe04000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing Hyper-V Autodetect ===
 
-[                       0]  266338304/ 266338304 sectors not allocated at offset 0 bytes (0)
+127 GiB (0x1fc0000000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing Hyper-V with current_size force ===
 
-[                       0]  266338304/ 266338304 sectors not allocated at offset 0 bytes (0)
+127 GiB (0x1fc0000000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing Hyper-V with chs force ===
 
-[                       0]  266334240/ 266334240 sectors not allocated at offset 0 bytes (0)
+126.998 GiB (0x1fbfe04000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing d2v Autodetect ===
 
-[                       0]   514560/  514560 sectors     allocated at offset 0 bytes (1)
+251.250 MiB (0xfb40000) bytes     allocated at offset 0 bytes (0x0)
 
 === Testing d2v with current_size force ===
 
-[                       0]   514560/  514560 sectors     allocated at offset 0 bytes (1)
+251.250 MiB (0xfb40000) bytes     allocated at offset 0 bytes (0x0)
 
 === Testing d2v with chs force ===
 
-[                       0]   514560/  514560 sectors     allocated at offset 0 bytes (1)
+251.250 MiB (0xfb40000) bytes     allocated at offset 0 bytes (0x0)
 
 === Testing Image create, default ===
 
@@ -42,15 +42,15 @@ Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296
 
 === Read created image, default opts ====
 
-[                       0]  8389584/ 8389584 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x10007a000) bytes not allocated at offset 0 bytes (0x0)
 
 === Read created image, force_size_calc=chs ====
 
-[                       0]  8389584/ 8389584 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x10007a000) bytes not allocated at offset 0 bytes (0x0)
 
 === Read created image, force_size_calc=current_size ====
 
-[                       0]  8389584/ 8389584 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x10007a000) bytes not allocated at offset 0 bytes (0x0)
 
 === Testing Image create, force_size ===
 
@@ -58,13 +58,13 @@ Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296 forc
 
 === Read created image, default opts ====
 
-[                       0]  8388608/ 8388608 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x100000000) bytes not allocated at offset 0 bytes (0x0)
 
 === Read created image, force_size_calc=chs ====
 
-[                       0]  8388608/ 8388608 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x100000000) bytes not allocated at offset 0 bytes (0x0)
 
 === Read created image, force_size_calc=current_size ====
 
-[                       0]  8388608/ 8388608 sectors not allocated at offset 0 bytes (0)
+4 GiB (0x100000000) bytes not allocated at offset 0 bytes (0x0)
 *** done

From e0ef439588ce1ede747f82b77d893190c1cc9f4d Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:14 -0500
Subject: [PATCH 04/22] blkdebug: Sanity check block layer guarantees

Commits 04ed95f4 and 1a62d0ac updated the block layer to auto-fragment
any I/O to fit within device boundaries. Additionally, when using a
minimum alignment of 4k, we want to ensure the block layer does proper
read-modify-write rather than requesting I/O on a slice of a sector.
Let's enforce that the contract is obeyed when using blkdebug.  For
now, blkdebug only allows alignment overrides, and just inherits other
limits from whatever device it is wrapping, but a future patch will
further enhance things.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170429191419.30051-5-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 3c088934db..a562e2e32f 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -429,6 +429,13 @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     BDRVBlkdebugState *s = bs->opaque;
     BlkdebugRule *rule = NULL;
 
+    /* Sanity check block layer guarantees */
+    assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+    assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+    if (bs->bl.max_transfer) {
+        assert(bytes <= bs->bl.max_transfer);
+    }
+
     QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
         uint64_t inject_offset = rule->options.inject.offset;
 
@@ -453,6 +460,13 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     BDRVBlkdebugState *s = bs->opaque;
     BlkdebugRule *rule = NULL;
 
+    /* Sanity check block layer guarantees */
+    assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+    assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+    if (bs->bl.max_transfer) {
+        assert(bytes <= bs->bl.max_transfer);
+    }
+
     QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
         uint64_t inject_offset = rule->options.inject.offset;
 

From d157ed5f7235f3d2d5596a514ad7507b18e24b88 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:15 -0500
Subject: [PATCH 05/22] blkdebug: Refactor error injection

Rather than repeat the logic at each caller of checking if a Rule
exists that warrants an error injection, fold that logic into
inject_error(); and rename it to rule_check() for legibility.
This will help the next patch, which adds two more callers that
need to check rules for the potential of injecting errors.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170429191419.30051-6-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c | 74 +++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index a562e2e32f..554573fd14 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -403,11 +403,30 @@ out:
     return ret;
 }
 
-static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
+static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
 {
     BDRVBlkdebugState *s = bs->opaque;
-    int error = rule->options.inject.error;
-    bool immediately = rule->options.inject.immediately;
+    BlkdebugRule *rule = NULL;
+    int error;
+    bool immediately;
+
+    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+        uint64_t inject_offset = rule->options.inject.offset;
+
+        if (inject_offset == -1 ||
+            (bytes && inject_offset >= offset &&
+             inject_offset < offset + bytes))
+        {
+            break;
+        }
+    }
+
+    if (!rule || !rule->options.inject.error) {
+        return 0;
+    }
+
+    immediately = rule->options.inject.immediately;
+    error = rule->options.inject.error;
 
     if (rule->options.inject.once) {
         QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next);
@@ -426,8 +445,7 @@ static int coroutine_fn
 blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                    QEMUIOVector *qiov, int flags)
 {
-    BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugRule *rule = NULL;
+    int err;
 
     /* Sanity check block layer guarantees */
     assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
@@ -436,18 +454,9 @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
         assert(bytes <= bs->bl.max_transfer);
     }
 
-    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-        uint64_t inject_offset = rule->options.inject.offset;
-
-        if (inject_offset == -1 ||
-            (inject_offset >= offset && inject_offset < offset + bytes))
-        {
-            break;
-        }
-    }
-
-    if (rule && rule->options.inject.error) {
-        return inject_error(bs, rule);
+    err = rule_check(bs, offset, bytes);
+    if (err) {
+        return err;
     }
 
     return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@@ -457,8 +466,7 @@ static int coroutine_fn
 blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                     QEMUIOVector *qiov, int flags)
 {
-    BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugRule *rule = NULL;
+    int err;
 
     /* Sanity check block layer guarantees */
     assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
@@ -467,18 +475,9 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
         assert(bytes <= bs->bl.max_transfer);
     }
 
-    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-        uint64_t inject_offset = rule->options.inject.offset;
-
-        if (inject_offset == -1 ||
-            (inject_offset >= offset && inject_offset < offset + bytes))
-        {
-            break;
-        }
-    }
-
-    if (rule && rule->options.inject.error) {
-        return inject_error(bs, rule);
+    err = rule_check(bs, offset, bytes);
+    if (err) {
+        return err;
     }
 
     return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@@ -486,17 +485,10 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 
 static int blkdebug_co_flush(BlockDriverState *bs)
 {
-    BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugRule *rule = NULL;
+    int err = rule_check(bs, 0, 0);
 
-    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-        if (rule->options.inject.offset == -1) {
-            break;
-        }
-    }
-
-    if (rule && rule->options.inject.error) {
-        return inject_error(bs, rule);
+    if (err) {
+        return err;
     }
 
     return bdrv_co_flush(bs->file->bs);

From 63188c245013dbe383e8b031e665f813e2452ea5 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:16 -0500
Subject: [PATCH 06/22] blkdebug: Add pass-through write_zero and discard
 support

In order to test the effects of artificial geometry constraints
on operations like write zero or discard, we first need blkdebug
to manage these actions.  It also allows us to inject errors on
those operations, just like we can for read/write/flush.

We can also test the contract promised by the block layer; namely,
if a device has specified limits on alignment or maximum size,
then those limits must be obeyed (for now, the blkdebug driver
merely inherits limits from whatever it is wrapping, but the next
patch will further enhance it to allow specific limit overrides).

This patch intentionally refuses to service requests smaller than
the requested alignments; this is because an upcoming patch adds
a qemu-iotest to prove that the block layer is correctly handling
fragmentation, but the test only works if there is a way to tell
the difference at artificial alignment boundaries when blkdebug is
using a larger-than-default alignment.  If we let the blkdebug
layer always defer to the underlying layer, which potentially has
a smaller granularity, the iotest will be thwarted.

Tested by setting up an NBD server with export 'foo', then invoking:
$ ./qemu-io
qemu-io> open -o driver=blkdebug blkdebug::nbd://localhost:10809/foo
qemu-io> d 0 15M
qemu-io> w -z 0 15M

Pre-patch, the server never sees the discard (it was silently
eaten by the block layer); post-patch it is passed across the
wire.  Likewise, pre-patch the write is always passed with
NBD_WRITE (with 15M of zeroes on the wire), while post-patch
it can utilize NBD_WRITE_ZEROES (for less traffic).

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170429191419.30051-7-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 554573fd14..b8cc87617e 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -1,6 +1,7 @@
 /*
  * Block protocol for I/O error injection
  *
+ * Copyright (C) 2016-2017 Red Hat, Inc.
  * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -382,6 +383,11 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto out;
     }
 
+    bs->supported_write_flags = BDRV_REQ_FUA &
+        bs->file->bs->supported_write_flags;
+    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+        bs->file->bs->supported_zero_flags;
+
     /* Set request alignment */
     align = qemu_opt_get_size(opts, "align", 0);
     if (align < INT_MAX && is_power_of_2(align)) {
@@ -494,6 +500,72 @@ static int blkdebug_co_flush(BlockDriverState *bs)
     return bdrv_co_flush(bs->file->bs);
 }
 
+static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
+                                                  int64_t offset, int count,
+                                                  BdrvRequestFlags flags)
+{
+    uint32_t align = MAX(bs->bl.request_alignment,
+                         bs->bl.pwrite_zeroes_alignment);
+    int err;
+
+    /* Only pass through requests that are larger than requested
+     * preferred alignment (so that we test the fallback to writes on
+     * unaligned portions), and check that the block layer never hands
+     * us anything unaligned that crosses an alignment boundary.  */
+    if (count < align) {
+        assert(QEMU_IS_ALIGNED(offset, align) ||
+               QEMU_IS_ALIGNED(offset + count, align) ||
+               DIV_ROUND_UP(offset, align) ==
+               DIV_ROUND_UP(offset + count, align));
+        return -ENOTSUP;
+    }
+    assert(QEMU_IS_ALIGNED(offset, align));
+    assert(QEMU_IS_ALIGNED(count, align));
+    if (bs->bl.max_pwrite_zeroes) {
+        assert(count <= bs->bl.max_pwrite_zeroes);
+    }
+
+    err = rule_check(bs, offset, count);
+    if (err) {
+        return err;
+    }
+
+    return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
+}
+
+static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
+                                             int64_t offset, int count)
+{
+    uint32_t align = bs->bl.pdiscard_alignment;
+    int err;
+
+    /* Only pass through requests that are larger than requested
+     * minimum alignment, and ensure that unaligned requests do not
+     * cross optimum discard boundaries. */
+    if (count < bs->bl.request_alignment) {
+        assert(QEMU_IS_ALIGNED(offset, align) ||
+               QEMU_IS_ALIGNED(offset + count, align) ||
+               DIV_ROUND_UP(offset, align) ==
+               DIV_ROUND_UP(offset + count, align));
+        return -ENOTSUP;
+    }
+    assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+    assert(QEMU_IS_ALIGNED(count, bs->bl.request_alignment));
+    if (align && count >= align) {
+        assert(QEMU_IS_ALIGNED(offset, align));
+        assert(QEMU_IS_ALIGNED(count, align));
+    }
+    if (bs->bl.max_pdiscard) {
+        assert(count <= bs->bl.max_pdiscard);
+    }
+
+    err = rule_check(bs, offset, count);
+    if (err) {
+        return err;
+    }
+
+    return bdrv_co_pdiscard(bs->file->bs, offset, count);
+}
 
 static void blkdebug_close(BlockDriverState *bs)
 {
@@ -748,6 +820,8 @@ static BlockDriver bdrv_blkdebug = {
     .bdrv_co_preadv         = blkdebug_co_preadv,
     .bdrv_co_pwritev        = blkdebug_co_pwritev,
     .bdrv_co_flush_to_disk  = blkdebug_co_flush,
+    .bdrv_co_pwrite_zeroes  = blkdebug_co_pwrite_zeroes,
+    .bdrv_co_pdiscard       = blkdebug_co_pdiscard,
 
     .bdrv_debug_event           = blkdebug_debug_event,
     .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,

From 3dc834f8795a46f919d90b1e5369308628858601 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:17 -0500
Subject: [PATCH 07/22] blkdebug: Simplify override logic

Rather than store into a local variable, then copy to the struct
if the value is valid, then reporting errors otherwise, it is
simpler to just store into the struct and report errors if the
value is invalid.  This however requires that the struct store
a 64-bit number, rather than a narrower type.  Likewise, setting
a sane errno value in ret prior to the sequence of parsing and
jumping to out: on error makes it easier for the next patch to
add a chain of similar checks.

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170429191419.30051-8-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index b8cc87617e..29ab7a3a8b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -38,7 +38,7 @@
 typedef struct BDRVBlkdebugState {
     int state;
     int new_state;
-    int align;
+    uint64_t align;
 
     /* For blkdebug_refresh_filename() */
     char *config_file;
@@ -353,7 +353,6 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVBlkdebugState *s = bs->opaque;
     QemuOpts *opts;
     Error *local_err = NULL;
-    uint64_t align;
     int ret;
 
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -387,20 +386,17 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         bs->file->bs->supported_write_flags;
     bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
         bs->file->bs->supported_zero_flags;
+    ret = -EINVAL;
 
     /* Set request alignment */
-    align = qemu_opt_get_size(opts, "align", 0);
-    if (align < INT_MAX && is_power_of_2(align)) {
-        s->align = align;
-    } else if (align) {
-        error_setg(errp, "Invalid alignment");
-        ret = -EINVAL;
+    s->align = qemu_opt_get_size(opts, "align", 0);
+    if (s->align && (s->align >= INT_MAX || !is_power_of_2(s->align))) {
+        error_setg(errp, "Cannot meet constraints with align %" PRIu64,
+                   s->align);
         goto out;
     }
 
     ret = 0;
-    goto out;
-
 out:
     if (ret < 0) {
         g_free(s->config_file);

From 430b26a82da61876c4eaf559ae02332582968043 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:18 -0500
Subject: [PATCH 08/22] blkdebug: Add ability to override unmap geometries

Make it easier to simulate various unusual hardware setups (for
example, recent commits 3482b9b and b8d0a98 affect the Dell
Equallogic iSCSI with its 15M preferred and maximum unmap and
write zero sizing, or b2f95fe deals with the Linux loopback
block device having a max_transfer of 64k), by allowing blkdebug
to wrap any other device with further restrictions on various
alignments.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170429191419.30051-9-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c     | 96 +++++++++++++++++++++++++++++++++++++++++++-
 qapi/block-core.json | 33 +++++++++++++--
 2 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 29ab7a3a8b..a5196e889d 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -39,6 +39,11 @@ typedef struct BDRVBlkdebugState {
     int state;
     int new_state;
     uint64_t align;
+    uint64_t max_transfer;
+    uint64_t opt_write_zero;
+    uint64_t max_write_zero;
+    uint64_t opt_discard;
+    uint64_t max_discard;
 
     /* For blkdebug_refresh_filename() */
     char *config_file;
@@ -343,6 +348,31 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_SIZE,
             .help = "Required alignment in bytes",
         },
+        {
+            .name = "max-transfer",
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum transfer size in bytes",
+        },
+        {
+            .name = "opt-write-zero",
+            .type = QEMU_OPT_SIZE,
+            .help = "Optimum write zero alignment in bytes",
+        },
+        {
+            .name = "max-write-zero",
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum write zero size in bytes",
+        },
+        {
+            .name = "opt-discard",
+            .type = QEMU_OPT_SIZE,
+            .help = "Optimum discard alignment in bytes",
+        },
+        {
+            .name = "max-discard",
+            .type = QEMU_OPT_SIZE,
+            .help = "Maximum discard size in bytes",
+        },
         { /* end of list */ }
     },
 };
@@ -354,6 +384,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     QemuOpts *opts;
     Error *local_err = NULL;
     int ret;
+    uint64_t align;
 
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -388,13 +419,61 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         bs->file->bs->supported_zero_flags;
     ret = -EINVAL;
 
-    /* Set request alignment */
+    /* Set alignment overrides */
     s->align = qemu_opt_get_size(opts, "align", 0);
     if (s->align && (s->align >= INT_MAX || !is_power_of_2(s->align))) {
         error_setg(errp, "Cannot meet constraints with align %" PRIu64,
                    s->align);
         goto out;
     }
+    align = MAX(s->align, bs->file->bs->bl.request_alignment);
+
+    s->max_transfer = qemu_opt_get_size(opts, "max-transfer", 0);
+    if (s->max_transfer &&
+        (s->max_transfer >= INT_MAX ||
+         !QEMU_IS_ALIGNED(s->max_transfer, align))) {
+        error_setg(errp, "Cannot meet constraints with max-transfer %" PRIu64,
+                   s->max_transfer);
+        goto out;
+    }
+
+    s->opt_write_zero = qemu_opt_get_size(opts, "opt-write-zero", 0);
+    if (s->opt_write_zero &&
+        (s->opt_write_zero >= INT_MAX ||
+         !QEMU_IS_ALIGNED(s->opt_write_zero, align))) {
+        error_setg(errp, "Cannot meet constraints with opt-write-zero %" PRIu64,
+                   s->opt_write_zero);
+        goto out;
+    }
+
+    s->max_write_zero = qemu_opt_get_size(opts, "max-write-zero", 0);
+    if (s->max_write_zero &&
+        (s->max_write_zero >= INT_MAX ||
+         !QEMU_IS_ALIGNED(s->max_write_zero,
+                          MAX(s->opt_write_zero, align)))) {
+        error_setg(errp, "Cannot meet constraints with max-write-zero %" PRIu64,
+                   s->max_write_zero);
+        goto out;
+    }
+
+    s->opt_discard = qemu_opt_get_size(opts, "opt-discard", 0);
+    if (s->opt_discard &&
+        (s->opt_discard >= INT_MAX ||
+         !QEMU_IS_ALIGNED(s->opt_discard, align))) {
+        error_setg(errp, "Cannot meet constraints with opt-discard %" PRIu64,
+                   s->opt_discard);
+        goto out;
+    }
+
+    s->max_discard = qemu_opt_get_size(opts, "max-discard", 0);
+    if (s->max_discard &&
+        (s->max_discard >= INT_MAX ||
+         !QEMU_IS_ALIGNED(s->max_discard,
+                          MAX(s->opt_discard, align)))) {
+        error_setg(errp, "Cannot meet constraints with max-discard %" PRIu64,
+                   s->max_discard);
+        goto out;
+    }
 
     ret = 0;
 out:
@@ -789,6 +868,21 @@ static void blkdebug_refresh_limits(BlockDriverState *bs, Error **errp)
     if (s->align) {
         bs->bl.request_alignment = s->align;
     }
+    if (s->max_transfer) {
+        bs->bl.max_transfer = s->max_transfer;
+    }
+    if (s->opt_write_zero) {
+        bs->bl.pwrite_zeroes_alignment = s->opt_write_zero;
+    }
+    if (s->max_write_zero) {
+        bs->bl.max_pwrite_zeroes = s->max_write_zero;
+    }
+    if (s->opt_discard) {
+        bs->bl.pdiscard_alignment = s->opt_discard;
+    }
+    if (s->max_discard) {
+        bs->bl.max_pdiscard = s->max_discard;
+    }
 }
 
 static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 52e3ecd505..6b974b952f 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2434,8 +2434,33 @@
 #
 # @config:          filename of the configuration file
 #
-# @align:           required alignment for requests in bytes,
-#                   must be power of 2, or 0 for default
+# @align:           required alignment for requests in bytes, must be
+#                   positive power of 2, or 0 for default
+#
+# @max-transfer:    maximum size for I/O transfers in bytes, must be
+#                   positive multiple of @align and of the underlying
+#                   file's request alignment (but need not be a power of
+#                   2), or 0 for default (since 2.10)
+#
+# @opt-write-zero:  preferred alignment for write zero requests in bytes,
+#                   must be positive multiple of @align and of the
+#                   underlying file's request alignment (but need not be a
+#                   power of 2), or 0 for default (since 2.10)
+#
+# @max-write-zero:  maximum size for write zero requests in bytes, must be
+#                   positive multiple of @align, of @opt-write-zero, and of
+#                   the underlying file's request alignment (but need not
+#                   be a power of 2), or 0 for default (since 2.10)
+#
+# @opt-discard:     preferred alignment for discard requests in bytes, must
+#                   be positive multiple of @align and of the underlying
+#                   file's request alignment (but need not be a power of
+#                   2), or 0 for default (since 2.10)
+#
+# @max-discard:     maximum size for discard requests in bytes, must be
+#                   positive multiple of @align, of @opt-discard, and of
+#                   the underlying file's request alignment (but need not
+#                   be a power of 2), or 0 for default (since 2.10)
 #
 # @inject-error:    array of error injection descriptions
 #
@@ -2446,7 +2471,9 @@
 { 'struct': 'BlockdevOptionsBlkdebug',
   'data': { 'image': 'BlockdevRef',
             '*config': 'str',
-            '*align': 'int',
+            '*align': 'int', '*max-transfer': 'int32',
+            '*opt-write-zero': 'int32', '*max-write-zero': 'int32',
+            '*opt-discard': 'int32', '*max-discard': 'int32',
             '*inject-error': ['BlkdebugInjectErrorOptions'],
             '*set-state': ['BlkdebugSetStateOptions'] } }
 

From 40812d937392fddc11f72a668aef251039cc15ce Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 29 Apr 2017 14:14:19 -0500
Subject: [PATCH 09/22] tests: Add coverage for recent block geometry fixes

Use blkdebug's new geometry constraints to emulate setups that
have needed past regression fixes: write zeroes asserting
when running through a loopback block device with max-transfer
smaller than cluster size, and discard rounding away portions
of requests not aligned to preferred boundaries.  Also, add
coverage that the block layer is honoring max transfer limits.

For now, a single iotest performs all actions, with the idea
that we can add future blkdebug constraint test cases in the
same file; but it can be split into multiple iotests if we find
reason to run one portion of the test in more setups than what
are possible in the other.

For reference, the final portion of the test (checking whether
discard passes as much as possible to the lowest layers of the
stack) works as follows:

qemu-io: discard 30M at 80000001, passed to blkdebug
  blkdebug: discard 511 bytes at 80000001, -ENOTSUP (smaller than
blkdebug's 512 align)
  blkdebug: discard 14371328 bytes at 80000512, passed to qcow2
    qcow2: discard 739840 bytes at 80000512, -ENOTSUP (smaller than
qcow2's 1M align)
    qcow2: discard 13M bytes at 77M, succeeds
  blkdebug: discard 15M bytes at 90M, passed to qcow2
    qcow2: discard 15M bytes at 90M, succeeds
  blkdebug: discard 1356800 bytes at 105M, passed to qcow2
    qcow2: discard 1M at 105M, succeeds
    qcow2: discard 308224 bytes at 106M, -ENOTSUP (smaller than qcow2's
1M align)
  blkdebug: discard 1 byte at 111457280, -ENOTSUP (smaller than
blkdebug's 512 align)

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170429191419.30051-10-eblake@redhat.com
[mreitz: For cooperation with image locking, add -r to the qemu-io
         invocation which verifies the image content]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/177     | 114 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/177.out |  49 ++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 164 insertions(+)
 create mode 100755 tests/qemu-iotests/177
 create mode 100644 tests/qemu-iotests/177.out

diff --git a/tests/qemu-iotests/177 b/tests/qemu-iotests/177
new file mode 100755
index 0000000000..2005c174f2
--- /dev/null
+++ b/tests/qemu-iotests/177
@@ -0,0 +1,114 @@
+#!/bin/bash
+#
+# Test corner cases with unusual block geometries
+#
+# Copyright (C) 2016-2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=eblake@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+
+CLUSTER_SIZE=1M
+size=128M
+options=driver=blkdebug,image.driver=qcow2
+
+echo
+echo "== setting up files =="
+
+TEST_IMG="$TEST_IMG.base" _make_test_img $size
+$QEMU_IO -c "write -P 11 0 $size" "$TEST_IMG.base" | _filter_qemu_io
+_make_test_img -b "$TEST_IMG.base"
+$QEMU_IO -c "write -P 22 0 $size" "$TEST_IMG" | _filter_qemu_io
+
+# Limited to 64k max-transfer
+echo
+echo "== constrained alignment and max-transfer =="
+limits=align=4k,max-transfer=64k
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+         -c "write -P 33 1000 128k" -c "read -P 33 1000 128k" | _filter_qemu_io
+
+echo
+echo "== write zero with constrained max-transfer =="
+limits=align=512,max-transfer=64k,opt-write-zero=$CLUSTER_SIZE
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+         -c "write -z 8003584 2093056" | _filter_qemu_io
+
+# non-power-of-2 write-zero/discard alignments
+echo
+echo "== non-power-of-2 write zeroes limits =="
+
+limits=align=512,opt-write-zero=15M,max-write-zero=15M,opt-discard=15M,max-discard=15M
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+         -c "write -z 32M 32M" | _filter_qemu_io
+
+echo
+echo "== non-power-of-2 discard limits =="
+
+limits=align=512,opt-write-zero=15M,max-write-zero=15M,opt-discard=15M,max-discard=15M
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+         -c "discard 80000001 30M" | _filter_qemu_io
+
+echo
+echo "== verify image content =="
+
+function verify_io()
+{
+    if ($QEMU_IMG info -f "$IMGFMT" "$TEST_IMG" |
+	    grep "compat: 0.10" > /dev/null); then
+        # For v2 images, discarded clusters are read from the backing file
+        discarded=11
+    else
+        # Discarded clusters are zeroed for v3 or later
+        discarded=0
+    fi
+
+    echo read -P 22 0 1000
+    echo read -P 33 1000 128k
+    echo read -P 22 132072 7871512
+    echo read -P 0 8003584 2093056
+    echo read -P 22 10096640 23457792
+    echo read -P 0 32M 32M
+    echo read -P 22 64M 13M
+    echo read -P $discarded 77M 29M
+    echo read -P 22 106M 22M
+}
+
+verify_io | $QEMU_IO -r "$TEST_IMG" | _filter_qemu_io
+
+_check_test_img
+
+# success, all done
+echo "*** done"
+status=0
diff --git a/tests/qemu-iotests/177.out b/tests/qemu-iotests/177.out
new file mode 100644
index 0000000000..e887542678
--- /dev/null
+++ b/tests/qemu-iotests/177.out
@@ -0,0 +1,49 @@
+QA output created by 177
+
+== setting up files ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134217728
+wrote 134217728/134217728 bytes at offset 0
+128 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 134217728/134217728 bytes at offset 0
+128 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== constrained alignment and max-transfer ==
+wrote 131072/131072 bytes at offset 1000
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 131072/131072 bytes at offset 1000
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== write zero with constrained max-transfer ==
+wrote 2093056/2093056 bytes at offset 8003584
+1.996 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== non-power-of-2 write zeroes limits ==
+wrote 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== non-power-of-2 discard limits ==
+discard 31457280/31457280 bytes at offset 80000001
+30 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify image content ==
+read 1000/1000 bytes at offset 0
+1000 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 131072/131072 bytes at offset 1000
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 7871512/7871512 bytes at offset 132072
+7.507 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2093056/2093056 bytes at offset 8003584
+1.996 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 23457792/23457792 bytes at offset 10096640
+22.371 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 33554432/33554432 bytes at offset 33554432
+32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 13631488/13631488 bytes at offset 67108864
+13 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 30408704/30408704 bytes at offset 80740352
+29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 23068672/23068672 bytes at offset 111149056
+22 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+No errors were found on the image.
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index a4549d82b2..bb6df8f2d7 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -170,5 +170,6 @@
 174 auto
 175 auto quick
 176 rw auto backing
+177 rw auto quick
 181 rw auto migration
 182 rw auto quick

From b32cbae11107e9e172e5c58425a2a8362e7382ed Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:41 -0500
Subject: [PATCH 10/22] qcow2: Nicer variable names in
 qcow2_update_snapshot_refcount()

In order to keep checkpatch happy when the next patch changes
indentation, we first have to shorten some long lines.  The easiest
approach is to use a new variable in place of
'offset & L2E_OFFSET_MASK', except that 'offset' is the best name
for that variable.  Change '[old_]offset' to '[old_]entry' to
make room.

While touching things, also fix checkpatch warnings about unusual
'for' statements.

Suggested by Max Reitz <mreitz@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170507000552.20847-2-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4efca7ebdb..db0af2ce27 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1059,9 +1059,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
+    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
     bool l1_allocated = false;
-    int64_t old_offset, old_l2_offset;
+    int64_t old_entry, old_l2_offset;
     int i, j, l1_modified = 0, nb_csectors;
     int ret;
 
@@ -1089,15 +1089,16 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
             goto fail;
         }
 
-        for(i = 0;i < l1_size; i++)
+        for (i = 0; i < l1_size; i++) {
             be64_to_cpus(&l1_table[i]);
+        }
     } else {
         assert(l1_size == s->l1_size);
         l1_table = s->l1_table;
         l1_allocated = false;
     }
 
-    for(i = 0; i < l1_size; i++) {
+    for (i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
         if (l2_offset) {
             old_l2_offset = l2_offset;
@@ -1117,20 +1118,22 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 goto fail;
             }
 
-            for(j = 0; j < s->l2_size; j++) {
+            for (j = 0; j < s->l2_size; j++) {
                 uint64_t cluster_index;
+                uint64_t offset;
 
-                offset = be64_to_cpu(l2_table[j]);
-                old_offset = offset;
-                offset &= ~QCOW_OFLAG_COPIED;
+                entry = be64_to_cpu(l2_table[j]);
+                old_entry = entry;
+                entry &= ~QCOW_OFLAG_COPIED;
+                offset = entry & L2E_OFFSET_MASK;
 
-                switch (qcow2_get_cluster_type(offset)) {
+                switch (qcow2_get_cluster_type(entry)) {
                     case QCOW2_CLUSTER_COMPRESSED:
-                        nb_csectors = ((offset >> s->csize_shift) &
+                        nb_csectors = ((entry >> s->csize_shift) &
                                        s->csize_mask) + 1;
                         if (addend != 0) {
                             ret = update_refcount(bs,
-                                (offset & s->cluster_offset_mask) & ~511,
+                                (entry & s->cluster_offset_mask) & ~511,
                                 nb_csectors * 512, abs(addend), addend < 0,
                                 QCOW2_DISCARD_SNAPSHOT);
                             if (ret < 0) {
@@ -1143,18 +1146,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
                     case QCOW2_CLUSTER_NORMAL:
                     case QCOW2_CLUSTER_ZERO:
-                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
+                        if (offset_into_cluster(s, offset)) {
                             qcow2_signal_corruption(bs, true, -1, -1, "Data "
-                                                    "cluster offset %#llx "
-                                                    "unaligned (L2 offset: %#"
+                                                    "cluster offset %#" PRIx64
+                                                    " unaligned (L2 offset: %#"
                                                     PRIx64 ", L2 index: %#x)",
-                                                    offset & L2E_OFFSET_MASK,
-                                                    l2_offset, j);
+                                                    offset, l2_offset, j);
                             ret = -EIO;
                             goto fail;
                         }
 
-                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+                        cluster_index = offset >> s->cluster_bits;
                         if (!cluster_index) {
                             /* unallocated */
                             refcount = 0;
@@ -1184,14 +1186,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 }
 
                 if (refcount == 1) {
-                    offset |= QCOW_OFLAG_COPIED;
+                    entry |= QCOW_OFLAG_COPIED;
                 }
-                if (offset != old_offset) {
+                if (entry != old_entry) {
                     if (addend > 0) {
                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
                             s->refcount_block_cache);
                     }
-                    l2_table[j] = cpu_to_be64(offset);
+                    l2_table[j] = cpu_to_be64(entry);
                     qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                  l2_table);
                 }

From bbd995d8307e4a15a7724f11adf104b6e07a0849 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:42 -0500
Subject: [PATCH 11/22] qcow2: Use consistent switch indentation

Fix a couple of inconsistent indentations, before an upcoming
patch further tweaks the switch statements.
(best viewed with 'git diff -b').

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-3-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  | 32 +++++++--------
 block/qcow2-refcount.c | 90 +++++++++++++++++++++---------------------
 2 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 31077d8102..335a50512f 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1504,25 +1504,25 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
          * but rather fall through to the backing file.
          */
         switch (qcow2_get_cluster_type(old_l2_entry)) {
-            case QCOW2_CLUSTER_UNALLOCATED:
-                if (full_discard || !bs->backing) {
-                    continue;
-                }
-                break;
+        case QCOW2_CLUSTER_UNALLOCATED:
+            if (full_discard || !bs->backing) {
+                continue;
+            }
+            break;
 
-            case QCOW2_CLUSTER_ZERO:
-                /* Preallocated zero clusters should be discarded in any case */
-                if (!full_discard && (old_l2_entry & L2E_OFFSET_MASK) == 0) {
-                    continue;
-                }
-                break;
+        case QCOW2_CLUSTER_ZERO:
+            /* Preallocated zero clusters should be discarded in any case */
+            if (!full_discard && (old_l2_entry & L2E_OFFSET_MASK) == 0) {
+                continue;
+            }
+            break;
 
-            case QCOW2_CLUSTER_NORMAL:
-            case QCOW2_CLUSTER_COMPRESSED:
-                break;
+        case QCOW2_CLUSTER_NORMAL:
+        case QCOW2_CLUSTER_COMPRESSED:
+            break;
 
-            default:
-                abort();
+        default:
+            abort();
         }
 
         /* First remove L2 entries */
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index db0af2ce27..908dbe5314 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1128,61 +1128,61 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                 offset = entry & L2E_OFFSET_MASK;
 
                 switch (qcow2_get_cluster_type(entry)) {
-                    case QCOW2_CLUSTER_COMPRESSED:
-                        nb_csectors = ((entry >> s->csize_shift) &
-                                       s->csize_mask) + 1;
-                        if (addend != 0) {
-                            ret = update_refcount(bs,
+                case QCOW2_CLUSTER_COMPRESSED:
+                    nb_csectors = ((entry >> s->csize_shift) &
+                                   s->csize_mask) + 1;
+                    if (addend != 0) {
+                        ret = update_refcount(bs,
                                 (entry & s->cluster_offset_mask) & ~511,
                                 nb_csectors * 512, abs(addend), addend < 0,
                                 QCOW2_DISCARD_SNAPSHOT);
-                            if (ret < 0) {
-                                goto fail;
-                            }
-                        }
-                        /* compressed clusters are never modified */
-                        refcount = 2;
-                        break;
-
-                    case QCOW2_CLUSTER_NORMAL:
-                    case QCOW2_CLUSTER_ZERO:
-                        if (offset_into_cluster(s, offset)) {
-                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
-                                                    "cluster offset %#" PRIx64
-                                                    " unaligned (L2 offset: %#"
-                                                    PRIx64 ", L2 index: %#x)",
-                                                    offset, l2_offset, j);
-                            ret = -EIO;
-                            goto fail;
-                        }
-
-                        cluster_index = offset >> s->cluster_bits;
-                        if (!cluster_index) {
-                            /* unallocated */
-                            refcount = 0;
-                            break;
-                        }
-                        if (addend != 0) {
-                            ret = qcow2_update_cluster_refcount(bs,
-                                    cluster_index, abs(addend), addend < 0,
-                                    QCOW2_DISCARD_SNAPSHOT);
-                            if (ret < 0) {
-                                goto fail;
-                            }
-                        }
-
-                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                         if (ret < 0) {
                             goto fail;
                         }
-                        break;
+                    }
+                    /* compressed clusters are never modified */
+                    refcount = 2;
+                    break;
 
-                    case QCOW2_CLUSTER_UNALLOCATED:
+                case QCOW2_CLUSTER_NORMAL:
+                case QCOW2_CLUSTER_ZERO:
+                    if (offset_into_cluster(s, offset)) {
+                        qcow2_signal_corruption(bs, true, -1, -1, "Data "
+                                                "cluster offset %#" PRIx64
+                                                " unaligned (L2 offset: %#"
+                                                PRIx64 ", L2 index: %#x)",
+                                                offset, l2_offset, j);
+                        ret = -EIO;
+                        goto fail;
+                    }
+
+                    cluster_index = offset >> s->cluster_bits;
+                    if (!cluster_index) {
+                        /* unallocated */
                         refcount = 0;
                         break;
+                    }
+                    if (addend != 0) {
+                        ret = qcow2_update_cluster_refcount(bs,
+                                    cluster_index, abs(addend), addend < 0,
+                                    QCOW2_DISCARD_SNAPSHOT);
+                        if (ret < 0) {
+                            goto fail;
+                        }
+                    }
 
-                    default:
-                        abort();
+                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
+                    if (ret < 0) {
+                        goto fail;
+                    }
+                    break;
+
+                case QCOW2_CLUSTER_UNALLOCATED:
+                    refcount = 0;
+                    break;
+
+                default:
+                    abort();
                 }
 
                 if (refcount == 1) {

From 4c41cb4955effff3447bd07a09e0af9c6e0453e3 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:43 -0500
Subject: [PATCH 12/22] block: Update comments on BDRV_BLOCK_* meanings

We had some conflicting documentation: a nice 8-way table that
described all possible combinations of DATA, ZERO, and
OFFSET_VALID, contrasted with text that implied that OFFSET_VALID
always meant raw data could be read directly.  Furthermore, the
text refers a lot to bs->file, even though the interface was
updated back in 67a0fd2a to let the driver pass back a specific
BDS (not necessarily bs->file).  As the 8-way table is the
intended semantics, simplify the rest of the text to get rid of
the confusion.

ALLOCATED is always set by the block layer for convenience (drivers
do not have to worry about it).  RAW is used only internally, but
by more than the raw driver.  Document these additional items on
the driver callback.

Suggested-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-4-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block.h     | 35 +++++++++++++++++++----------------
 include/block/block_int.h |  7 +++++++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 90932b4709..9b355e92d8 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -121,29 +121,32 @@ typedef struct HDGeometry {
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
 
 /*
- * Allocation status flags
- * BDRV_BLOCK_DATA: data is read from a file returned by bdrv_get_block_status.
- * BDRV_BLOCK_ZERO: sectors read as zero
- * BDRV_BLOCK_OFFSET_VALID: sector stored as raw data in a file returned by
- *                          bdrv_get_block_status.
+ * Allocation status flags for bdrv_get_block_status() and friends.
+ *
+ * Public flags:
+ * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
+ * BDRV_BLOCK_ZERO: offset reads as zero
+ * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
  * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
- *                       layer (as opposed to the backing file)
- * BDRV_BLOCK_RAW: used internally to indicate that the request
- *                 was answered by the raw driver and that one
- *                 should look in bs->file directly.
+ *                       layer (short for DATA || ZERO), set by block layer
  *
- * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 represent the offset in
- * bs->file where sector data can be read from as raw data.
+ * Internal flag:
+ * BDRV_BLOCK_RAW: used internally to indicate that the request was
+ *                 answered by a passthrough driver such as raw and that the
+ *                 block layer should recompute the answer from bs->file.
  *
- * DATA == 0 && ZERO == 0 means that data is read from backing_hd if present.
+ * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 (BDRV_BLOCK_OFFSET_MASK)
+ * represent the offset in the returned BDS that is allocated for the
+ * corresponding raw data; however, whether that offset actually contains
+ * data also depends on BDRV_BLOCK_DATA and BDRV_BLOCK_ZERO, as follows:
  *
  * DATA ZERO OFFSET_VALID
- *  t    t        t       sectors read as zero, bs->file is zero at offset
- *  t    f        t       sectors read as valid from bs->file at offset
- *  f    t        t       sectors preallocated, read as zero, bs->file not
+ *  t    t        t       sectors read as zero, returned file is zero at offset
+ *  t    f        t       sectors read as valid from file at offset
+ *  f    t        t       sectors preallocated, read as zero, returned file not
  *                        necessarily zero at offset
  *  f    f        t       sectors preallocated but read from backing_hd,
- *                        bs->file contains garbage at offset
+ *                        returned file contains garbage at offset
  *  t    t        f       sectors preallocated, read as zero, unknown offset
  *  t    f        f       sectors read from unknown file or offset
  *  f    t        f       not allocated or unknown offset, read as zero
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 5750a448a6..8d3724cce6 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -165,6 +165,13 @@ struct BlockDriver {
         int64_t offset, int count, BdrvRequestFlags flags);
     int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
         int64_t offset, int count);
+
+    /*
+     * Building block for bdrv_block_status[_above]. The driver should
+     * answer only according to the current layer, and should not
+     * set BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW.  See block.h
+     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID.
+     */
     int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum,
         BlockDriverState **file);

From 4341df8a83d6a528a1e2855735f87fc3aab42b70 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:44 -0500
Subject: [PATCH 13/22] qcow2: Correctly report status of preallocated zero
 clusters

We were throwing away the preallocation information associated with
zero clusters.  But we should be matching the well-defined semantics
in bdrv_get_block_status(), where (BDRV_BLOCK_ZERO |
BDRV_BLOCK_OFFSET_VALID) informs the user which offset is reserved,
while still reminding the user that reading from that offset is
likely to read garbage.

count_contiguous_clusters_by_type() is now used only for unallocated
cluster runs, hence it gets renamed and tightened.

Making this change lets us see which portions of an image are zero
but preallocated, when using qemu-img map --output=json.  The
--output=human side intentionally ignores all zero clusters, whether
or not they are preallocated.

The fact that there is no change to qemu-iotests './check -qcow2'
merely means that we aren't yet testing this aspect of qemu-img;
a later patch will add a test.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-5-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 45 +++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 335a50512f..f3bfce65f7 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -334,16 +334,23 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 	return i;
 }
 
-static int count_contiguous_clusters_by_type(int nb_clusters,
-                                             uint64_t *l2_table,
-                                             int wanted_type)
+/*
+ * Checks how many consecutive unallocated clusters in a given L2
+ * table have the same cluster type.
+ */
+static int count_contiguous_clusters_unallocated(int nb_clusters,
+                                                 uint64_t *l2_table,
+                                                 int wanted_type)
 {
     int i;
 
+    assert(wanted_type == QCOW2_CLUSTER_ZERO ||
+           wanted_type == QCOW2_CLUSTER_UNALLOCATED);
     for (i = 0; i < nb_clusters; i++) {
-        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+        uint64_t entry = be64_to_cpu(l2_table[i]);
+        int type = qcow2_get_cluster_type(entry);
 
-        if (type != wanted_type) {
+        if (type != wanted_type || entry & L2E_OFFSET_MASK) {
             break;
         }
     }
@@ -565,14 +572,32 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
             ret = -EIO;
             goto fail;
         }
-        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
-                                              QCOW2_CLUSTER_ZERO);
-        *cluster_offset = 0;
+        /* Distinguish between pure zero clusters and pre-allocated ones */
+        if (*cluster_offset & L2E_OFFSET_MASK) {
+            c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+                                          &l2_table[l2_index], QCOW_OFLAG_ZERO);
+            *cluster_offset &= L2E_OFFSET_MASK;
+            if (offset_into_cluster(s, *cluster_offset)) {
+                qcow2_signal_corruption(bs, true, -1, -1,
+                                        "Preallocated zero cluster offset %#"
+                                        PRIx64 " unaligned (L2 offset: %#"
+                                        PRIx64 ", L2 index: %#x)",
+                                        *cluster_offset, l2_offset, l2_index);
+                ret = -EIO;
+                goto fail;
+            }
+        } else {
+            c = count_contiguous_clusters_unallocated(nb_clusters,
+                                                      &l2_table[l2_index],
+                                                      QCOW2_CLUSTER_ZERO);
+            *cluster_offset = 0;
+        }
         break;
     case QCOW2_CLUSTER_UNALLOCATED:
         /* how many empty clusters ? */
-        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
-                                              QCOW2_CLUSTER_UNALLOCATED);
+        c = count_contiguous_clusters_unallocated(nb_clusters,
+                                                  &l2_table[l2_index],
+                                                  QCOW2_CLUSTER_UNALLOCATED);
         *cluster_offset = 0;
         break;
     case QCOW2_CLUSTER_NORMAL:

From 3ef9521893c41638148d4452ee4da75fd8ef71a9 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:45 -0500
Subject: [PATCH 14/22] qcow2: Name typedef for cluster type

Although it doesn't add all that much type safety (this is C, after
all), it does add a bit of legibility to use the name QCow2ClusterType
instead of a plain int.

In particular, qcow2_get_cluster_offset() has an overloaded return
type; a QCow2ClusterType on success, and -errno on failure; keeping
the cluster type in a separate variable makes it slightly easier for
the next patch to make further computations based on the type.

Suggested-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170507000552.20847-6-eblake@redhat.com
[mreitz: Use the new type in two more places (one of them pulled from
         the next patch)]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  | 21 +++++++++++----------
 block/qcow2-refcount.c |  2 +-
 block/qcow2.h          |  6 +++---
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f3bfce65f7..26f9e0e4b8 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -309,7 +309,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
         uint64_t *l2_table, uint64_t stop_flags)
 {
     int i;
-    int first_cluster_type;
+    QCow2ClusterType first_cluster_type;
     uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
     uint64_t first_entry = be64_to_cpu(l2_table[0]);
     uint64_t offset = first_entry & mask;
@@ -340,7 +340,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
  */
 static int count_contiguous_clusters_unallocated(int nb_clusters,
                                                  uint64_t *l2_table,
-                                                 int wanted_type)
+                                                 QCow2ClusterType wanted_type)
 {
     int i;
 
@@ -348,7 +348,7 @@ static int count_contiguous_clusters_unallocated(int nb_clusters,
            wanted_type == QCOW2_CLUSTER_UNALLOCATED);
     for (i = 0; i < nb_clusters; i++) {
         uint64_t entry = be64_to_cpu(l2_table[i]);
-        int type = qcow2_get_cluster_type(entry);
+        QCow2ClusterType type = qcow2_get_cluster_type(entry);
 
         if (type != wanted_type || entry & L2E_OFFSET_MASK) {
             break;
@@ -500,6 +500,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int l1_bits, c;
     unsigned int offset_in_cluster;
     uint64_t bytes_available, bytes_needed, nb_clusters;
+    QCow2ClusterType type;
     int ret;
 
     offset_in_cluster = offset_into_cluster(s, offset);
@@ -522,13 +523,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 
     l1_index = offset >> l1_bits;
     if (l1_index >= s->l1_size) {
-        ret = QCOW2_CLUSTER_UNALLOCATED;
+        type = QCOW2_CLUSTER_UNALLOCATED;
         goto out;
     }
 
     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
     if (!l2_offset) {
-        ret = QCOW2_CLUSTER_UNALLOCATED;
+        type = QCOW2_CLUSTER_UNALLOCATED;
         goto out;
     }
 
@@ -557,8 +558,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
      * true */
     assert(nb_clusters <= INT_MAX);
 
-    ret = qcow2_get_cluster_type(*cluster_offset);
-    switch (ret) {
+    type = qcow2_get_cluster_type(*cluster_offset);
+    switch (type) {
     case QCOW2_CLUSTER_COMPRESSED:
         /* Compressed clusters can only be processed one by one */
         c = 1;
@@ -633,7 +634,7 @@ out:
     assert(bytes_available - offset_in_cluster <= UINT_MAX);
     *bytes = bytes_available - offset_in_cluster;
 
-    return ret;
+    return type;
 
 fail:
     qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
@@ -891,7 +892,7 @@ static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
-        int cluster_type = qcow2_get_cluster_type(l2_entry);
+        QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
 
         switch(cluster_type) {
         case QCOW2_CLUSTER_NORMAL:
@@ -1757,7 +1758,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
         for (j = 0; j < s->l2_size; j++) {
             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
             int64_t offset = l2_entry & L2E_OFFSET_MASK;
-            int cluster_type = qcow2_get_cluster_type(l2_entry);
+            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
             bool preallocated = offset != 0;
 
             if (cluster_type != QCOW2_CLUSTER_ZERO) {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 908dbe5314..e639b341e5 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1640,7 +1640,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
         for (j = 0; j < s->l2_size; j++) {
             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
-            int cluster_type = qcow2_get_cluster_type(l2_entry);
+            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
 
             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
diff --git a/block/qcow2.h b/block/qcow2.h
index 8731f24b82..c148bbcdb6 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -349,12 +349,12 @@ typedef struct QCowL2Meta
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
-enum {
+typedef enum QCow2ClusterType {
     QCOW2_CLUSTER_UNALLOCATED,
     QCOW2_CLUSTER_NORMAL,
     QCOW2_CLUSTER_COMPRESSED,
     QCOW2_CLUSTER_ZERO
-};
+} QCow2ClusterType;
 
 typedef enum QCow2MetadataOverlap {
     QCOW2_OL_MAIN_HEADER_BITNR    = 0,
@@ -443,7 +443,7 @@ static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s)
     return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits;
 }
 
-static inline int qcow2_get_cluster_type(uint64_t l2_entry)
+static inline QCow2ClusterType qcow2_get_cluster_type(uint64_t l2_entry)
 {
     if (l2_entry & QCOW_OFLAG_COMPRESSED) {
         return QCOW2_CLUSTER_COMPRESSED;

From fdfab37dfeffefbd4533b4158055c9b82d7c3e69 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:46 -0500
Subject: [PATCH 15/22] qcow2: Make distinction between zero cluster types
 obvious

Treat plain zero clusters differently from allocated ones, so that
we can simplify the logic of checking whether an offset is present.
Do this by splitting QCOW2_CLUSTER_ZERO into two new enums,
QCOW2_CLUSTER_ZERO_PLAIN and QCOW2_CLUSTER_ZERO_ALLOC.

I tried to arrange the enum so that we could use
'ret <= QCOW2_CLUSTER_ZERO_PLAIN' for all unallocated types, and
'ret >= QCOW2_CLUSTER_ZERO_ALLOC' for allocated types, although
I didn't actually end up taking advantage of the layout.

In many cases, this leads to simpler code, by properly combining
cases (sometimes, both zero types pair together, other times,
plain zero is more like unallocated while allocated zero is more
like normal).

Signed-off-by: Eric Blake <eblake@redhat.com>
Message-id: 20170507000552.20847-7-eblake@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c      | 79 +++++++++++++++-----------------------
 block/qcow2-refcount.c     | 44 +++++++++------------
 block/qcow2.c              |  9 +++--
 block/qcow2.h              |  8 +++-
 tests/qemu-iotests/060.out |  6 +--
 5 files changed, 63 insertions(+), 83 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 26f9e0e4b8..558c23957f 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -321,8 +321,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
     /* must be allocated */
     first_cluster_type = qcow2_get_cluster_type(first_entry);
     assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
-           (first_cluster_type == QCOW2_CLUSTER_ZERO &&
-            (first_entry & L2E_OFFSET_MASK) != 0));
+           first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -344,13 +343,13 @@ static int count_contiguous_clusters_unallocated(int nb_clusters,
 {
     int i;
 
-    assert(wanted_type == QCOW2_CLUSTER_ZERO ||
+    assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
            wanted_type == QCOW2_CLUSTER_UNALLOCATED);
     for (i = 0; i < nb_clusters; i++) {
         uint64_t entry = be64_to_cpu(l2_table[i]);
         QCow2ClusterType type = qcow2_get_cluster_type(entry);
 
-        if (type != wanted_type || entry & L2E_OFFSET_MASK) {
+        if (type != wanted_type) {
             break;
         }
     }
@@ -559,55 +558,36 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
     assert(nb_clusters <= INT_MAX);
 
     type = qcow2_get_cluster_type(*cluster_offset);
+    if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
+                                type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
+                                " in pre-v3 image (L2 offset: %#" PRIx64
+                                ", L2 index: %#x)", l2_offset, l2_index);
+        ret = -EIO;
+        goto fail;
+    }
     switch (type) {
     case QCOW2_CLUSTER_COMPRESSED:
         /* Compressed clusters can only be processed one by one */
         c = 1;
         *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
         break;
-    case QCOW2_CLUSTER_ZERO:
-        if (s->qcow_version < 3) {
-            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
-                                    " in pre-v3 image (L2 offset: %#" PRIx64
-                                    ", L2 index: %#x)", l2_offset, l2_index);
-            ret = -EIO;
-            goto fail;
-        }
-        /* Distinguish between pure zero clusters and pre-allocated ones */
-        if (*cluster_offset & L2E_OFFSET_MASK) {
-            c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                          &l2_table[l2_index], QCOW_OFLAG_ZERO);
-            *cluster_offset &= L2E_OFFSET_MASK;
-            if (offset_into_cluster(s, *cluster_offset)) {
-                qcow2_signal_corruption(bs, true, -1, -1,
-                                        "Preallocated zero cluster offset %#"
-                                        PRIx64 " unaligned (L2 offset: %#"
-                                        PRIx64 ", L2 index: %#x)",
-                                        *cluster_offset, l2_offset, l2_index);
-                ret = -EIO;
-                goto fail;
-            }
-        } else {
-            c = count_contiguous_clusters_unallocated(nb_clusters,
-                                                      &l2_table[l2_index],
-                                                      QCOW2_CLUSTER_ZERO);
-            *cluster_offset = 0;
-        }
-        break;
+    case QCOW2_CLUSTER_ZERO_PLAIN:
     case QCOW2_CLUSTER_UNALLOCATED:
         /* how many empty clusters ? */
         c = count_contiguous_clusters_unallocated(nb_clusters,
-                                                  &l2_table[l2_index],
-                                                  QCOW2_CLUSTER_UNALLOCATED);
+                                                  &l2_table[l2_index], type);
         *cluster_offset = 0;
         break;
+    case QCOW2_CLUSTER_ZERO_ALLOC:
     case QCOW2_CLUSTER_NORMAL:
         /* how many allocated clusters ? */
         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], QCOW_OFLAG_ZERO);
+                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
         *cluster_offset &= L2E_OFFSET_MASK;
         if (offset_into_cluster(s, *cluster_offset)) {
-            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
+            qcow2_signal_corruption(bs, true, -1, -1,
+                                    "Cluster allocation offset %#"
                                     PRIx64 " unaligned (L2 offset: %#" PRIx64
                                     ", L2 index: %#x)", *cluster_offset,
                                     l2_offset, l2_index);
@@ -902,7 +882,8 @@ static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
             break;
         case QCOW2_CLUSTER_UNALLOCATED:
         case QCOW2_CLUSTER_COMPRESSED:
-        case QCOW2_CLUSTER_ZERO:
+        case QCOW2_CLUSTER_ZERO_PLAIN:
+        case QCOW2_CLUSTER_ZERO_ALLOC:
             break;
         default:
             abort();
@@ -1203,8 +1184,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
      * wrong with our code. */
     assert(nb_clusters > 0);
 
-    if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO &&
-        (entry & L2E_OFFSET_MASK) != 0 && (entry & QCOW_OFLAG_COPIED) &&
+    if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
+        (entry & QCOW_OFLAG_COPIED) &&
         (!*host_offset ||
          start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
     {
@@ -1536,13 +1517,13 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
             }
             break;
 
-        case QCOW2_CLUSTER_ZERO:
-            /* Preallocated zero clusters should be discarded in any case */
-            if (!full_discard && (old_l2_entry & L2E_OFFSET_MASK) == 0) {
+        case QCOW2_CLUSTER_ZERO_PLAIN:
+            if (!full_discard) {
                 continue;
             }
             break;
 
+        case QCOW2_CLUSTER_ZERO_ALLOC:
         case QCOW2_CLUSTER_NORMAL:
         case QCOW2_CLUSTER_COMPRESSED:
             break;
@@ -1759,13 +1740,13 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
             int64_t offset = l2_entry & L2E_OFFSET_MASK;
             QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
-            bool preallocated = offset != 0;
 
-            if (cluster_type != QCOW2_CLUSTER_ZERO) {
+            if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
+                cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                 continue;
             }
 
-            if (!preallocated) {
+            if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                 if (!bs->backing) {
                     /* not backed; therefore we can simply deallocate the
                      * cluster */
@@ -1800,7 +1781,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                         "%#" PRIx64 " unaligned (L2 offset: %#"
                                         PRIx64 ", L2 index: %#x)", offset,
                                         l2_offset, j);
-                if (!preallocated) {
+                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                     qcow2_free_clusters(bs, offset, s->cluster_size,
                                         QCOW2_DISCARD_ALWAYS);
                 }
@@ -1810,7 +1791,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
 
             ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
             if (ret < 0) {
-                if (!preallocated) {
+                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                     qcow2_free_clusters(bs, offset, s->cluster_size,
                                         QCOW2_DISCARD_ALWAYS);
                 }
@@ -1819,7 +1800,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
 
             ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
             if (ret < 0) {
-                if (!preallocated) {
+                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                     qcow2_free_clusters(bs, offset, s->cluster_size,
                                         QCOW2_DISCARD_ALWAYS);
                 }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index e639b341e5..7c06061aae 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1028,18 +1028,17 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
         }
         break;
     case QCOW2_CLUSTER_NORMAL:
-    case QCOW2_CLUSTER_ZERO:
-        if (l2_entry & L2E_OFFSET_MASK) {
-            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
-                qcow2_signal_corruption(bs, false, -1, -1,
-                                        "Cannot free unaligned cluster %#llx",
-                                        l2_entry & L2E_OFFSET_MASK);
-            } else {
-                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
-                                    nb_clusters << s->cluster_bits, type);
-            }
+    case QCOW2_CLUSTER_ZERO_ALLOC:
+        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
+            qcow2_signal_corruption(bs, false, -1, -1,
+                                    "Cannot free unaligned cluster %#llx",
+                                    l2_entry & L2E_OFFSET_MASK);
+        } else {
+            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+                                nb_clusters << s->cluster_bits, type);
         }
         break;
+    case QCOW2_CLUSTER_ZERO_PLAIN:
     case QCOW2_CLUSTER_UNALLOCATED:
         break;
     default:
@@ -1145,10 +1144,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                     break;
 
                 case QCOW2_CLUSTER_NORMAL:
-                case QCOW2_CLUSTER_ZERO:
+                case QCOW2_CLUSTER_ZERO_ALLOC:
                     if (offset_into_cluster(s, offset)) {
-                        qcow2_signal_corruption(bs, true, -1, -1, "Data "
-                                                "cluster offset %#" PRIx64
+                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
+                                                "allocation offset %#" PRIx64
                                                 " unaligned (L2 offset: %#"
                                                 PRIx64 ", L2 index: %#x)",
                                                 offset, l2_offset, j);
@@ -1157,11 +1156,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                     }
 
                     cluster_index = offset >> s->cluster_bits;
-                    if (!cluster_index) {
-                        /* unallocated */
-                        refcount = 0;
-                        break;
-                    }
+                    assert(cluster_index);
                     if (addend != 0) {
                         ret = qcow2_update_cluster_refcount(bs,
                                     cluster_index, abs(addend), addend < 0,
@@ -1177,6 +1172,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                     }
                     break;
 
+                case QCOW2_CLUSTER_ZERO_PLAIN:
                 case QCOW2_CLUSTER_UNALLOCATED:
                     refcount = 0;
                     break;
@@ -1443,12 +1439,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
             }
             break;
 
-        case QCOW2_CLUSTER_ZERO:
-            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
-                break;
-            }
-            /* fall through */
-
+        case QCOW2_CLUSTER_ZERO_ALLOC:
         case QCOW2_CLUSTER_NORMAL:
         {
             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
@@ -1478,6 +1469,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
             break;
         }
 
+        case QCOW2_CLUSTER_ZERO_PLAIN:
         case QCOW2_CLUSTER_UNALLOCATED:
             break;
 
@@ -1642,8 +1634,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
             QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
 
-            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
-                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
+            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
+                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                 ret = qcow2_get_refcount(bs,
                                          data_offset >> s->cluster_bits,
                                          &refcount);
diff --git a/block/qcow2.c b/block/qcow2.c
index 8caf800529..3fa62b0096 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1385,7 +1385,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
         *file = bs->file->bs;
         status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
     }
-    if (ret == QCOW2_CLUSTER_ZERO) {
+    if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
         status |= BDRV_BLOCK_ZERO;
     } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
         status |= BDRV_BLOCK_DATA;
@@ -1482,7 +1482,8 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
             }
             break;
 
-        case QCOW2_CLUSTER_ZERO:
+        case QCOW2_CLUSTER_ZERO_PLAIN:
+        case QCOW2_CLUSTER_ZERO_ALLOC:
             qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
             break;
 
@@ -2491,7 +2492,9 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
         count = s->cluster_size;
         nr = s->cluster_size;
         ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
-        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
+        if (ret != QCOW2_CLUSTER_UNALLOCATED &&
+            ret != QCOW2_CLUSTER_ZERO_PLAIN &&
+            ret != QCOW2_CLUSTER_ZERO_ALLOC) {
             qemu_co_mutex_unlock(&s->lock);
             return -ENOTSUP;
         }
diff --git a/block/qcow2.h b/block/qcow2.h
index c148bbcdb6..810ceb84c8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -351,9 +351,10 @@ typedef struct QCowL2Meta
 
 typedef enum QCow2ClusterType {
     QCOW2_CLUSTER_UNALLOCATED,
+    QCOW2_CLUSTER_ZERO_PLAIN,
+    QCOW2_CLUSTER_ZERO_ALLOC,
     QCOW2_CLUSTER_NORMAL,
     QCOW2_CLUSTER_COMPRESSED,
-    QCOW2_CLUSTER_ZERO
 } QCow2ClusterType;
 
 typedef enum QCow2MetadataOverlap {
@@ -448,7 +449,10 @@ static inline QCow2ClusterType qcow2_get_cluster_type(uint64_t l2_entry)
     if (l2_entry & QCOW_OFLAG_COMPRESSED) {
         return QCOW2_CLUSTER_COMPRESSED;
     } else if (l2_entry & QCOW_OFLAG_ZERO) {
-        return QCOW2_CLUSTER_ZERO;
+        if (l2_entry & L2E_OFFSET_MASK) {
+            return QCOW2_CLUSTER_ZERO_ALLOC;
+        }
+        return QCOW2_CLUSTER_ZERO_PLAIN;
     } else if (!(l2_entry & L2E_OFFSET_MASK)) {
         return QCOW2_CLUSTER_UNALLOCATED;
     } else {
diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
index 5d40206ef8..9e8f5b9d79 100644
--- a/tests/qemu-iotests/060.out
+++ b/tests/qemu-iotests/060.out
@@ -135,7 +135,7 @@ qemu-img: Error while amending options: Input/output error
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-qcow2: Marking image as corrupt: Data cluster offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further corruption events will be suppressed
+qcow2: Marking image as corrupt: Cluster allocation offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further corruption events will be suppressed
 read failed: Input/output error
 
 === Testing unaligned pre-allocated zero cluster ===
@@ -166,7 +166,7 @@ discard 65536/65536 bytes at offset 0
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-qcow2: Image is corrupt: Data cluster offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further non-fatal corruption events will be suppressed
+qcow2: Image is corrupt: Cluster allocation offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further non-fatal corruption events will be suppressed
 read failed: Input/output error
 read failed: Input/output error
 
@@ -176,7 +176,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qcow2: Image is corrupt: Cannot free unaligned cluster 0x52a00; further non-fatal corruption events will be suppressed
-qcow2: Marking image as corrupt: Data cluster offset 0x62a00 unaligned (L2 offset: 0x40000, L2 index: 0x1); further corruption events will be suppressed
+qcow2: Marking image as corrupt: Cluster allocation offset 0x62a00 unaligned (L2 offset: 0x40000, L2 index: 0x1); further corruption events will be suppressed
 discard 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read failed: Input/output error

From 06cc5e2b2d01cb778c966e1b4135062556b3b054 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:47 -0500
Subject: [PATCH 16/22] qcow2: Optimize zero_single_l2() to minimize L2 churn

Similar to discard_single_l2(), we should try to avoid dirtying
the L2 cache when the cluster we are changing already has the
right characteristics.

Note that by the time we get to zero_single_l2(), BDRV_REQ_MAY_UNMAP
is a requirement to unallocate a cluster (this is because the block
layer clears that flag if discard.* flags during open requested that
we never punch holes - see the conversation around commit 170f4b2e,
https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg07306.html).
Therefore, this patch can only reuse a zero cluster as-is if either
unmapping is not requested, or if the zero cluster was not associated
with an allocation.

Technically, there are some cases where an unallocated cluster
already reads as all zeroes (namely, when there is no backing file
[easy: check bs->backing], or when the backing file also reads as
zeroes [harder: we can't check bdrv_get_block_status since we are
already holding the lock]), where the guest would not immediately see
a difference if we left that cluster unallocated.  But if the user
did not request unmapping, leaving an unallocated cluster is wrong;
and even if the user DID request unmapping, keeping a cluster
unallocated risks a subtle semantic change of guest-visible contents
if a backing file is later added, and it is not worth auditing
whether all internal uses such as mirror properly avoid an unmap
request.  Thus, this patch is intentionally limited to just clusters
that are already marked as zero.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-8-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 558c23957f..e2c5759d94 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1601,6 +1601,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
     int l2_index;
     int ret;
     int i;
+    bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);
 
     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
@@ -1613,12 +1614,22 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t old_offset;
+        QCow2ClusterType cluster_type;
 
         old_offset = be64_to_cpu(l2_table[l2_index + i]);
 
-        /* Update L2 entries */
+        /*
+         * Minimize L2 changes if the cluster already reads back as
+         * zeroes with correct allocation.
+         */
+        cluster_type = qcow2_get_cluster_type(old_offset);
+        if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN ||
+            (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) {
+            continue;
+        }
+
         qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
+        if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
         } else {

From d9ca2214bd058cfb4371b8373123c0444e4a5ac8 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:48 -0500
Subject: [PATCH 17/22] iotests: Improve _filter_qemu_img_map

Although _filter_qemu_img_map documents that it scrubs offsets, it
was only doing so for human mode.  Of the existing tests using the
filter (97, 122, 150, 154, 176), two of them are affected, but it
does not hurt the validity of the tests to not require particular
mappings (another test, 66, uses offsets but intentionally does not
pass through _filter_qemu_img_map, because it checks that offsets
are unchanged before and after an operation).

Another justification for this patch is that it will allow a future
patch to utilize 'qemu-img map --output=json' to check the status of
preallocated zero clusters without regards to the mapping (since
the qcow2 mapping can be very sensitive to the chosen cluster size,
when preallocation is not in use).

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-9-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/122.out       | 16 ++++++++--------
 tests/qemu-iotests/154.out       | 30 +++++++++++++++---------------
 tests/qemu-iotests/common.filter |  4 +++-
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index 9317d801ad..47d8656db8 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -112,7 +112,7 @@ read 3145728/3145728 bytes at offset 0
 3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 63963136/63963136 bytes at offset 3145728
 61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
 
 convert -c -S 0:
 read 3145728/3145728 bytes at offset 0
@@ -134,7 +134,7 @@ read 30408704/30408704 bytes at offset 3145728
 29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 33554432/33554432 bytes at offset 33554432
 32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
 
 convert -c -S 0 with source backing file:
 read 3145728/3145728 bytes at offset 0
@@ -152,7 +152,7 @@ read 30408704/30408704 bytes at offset 3145728
 29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 33554432/33554432 bytes at offset 33554432
 32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}]
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
 
 convert -c -S 0 -B ...
 read 3145728/3145728 bytes at offset 0
@@ -176,11 +176,11 @@ wrote 1024/1024 bytes at offset 17408
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 convert -S 4k
-[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 8192},
+[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false},
-{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 9216},
+{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
-{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 10240},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 4k
@@ -192,9 +192,9 @@ convert -c -S 4k
 { "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
 
 convert -S 8k
-[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": 8192},
+[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false},
-{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 17408},
+{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 8k
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index da9eabdda8..d3b68e7729 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -42,9 +42,9 @@ read 1024/1024 bytes at offset 65536
 read 2048/2048 bytes at offset 67584
 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
-{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 36864, "length": 28672, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 69632, "length": 134148096, "depth": 1, "zero": true, "data": false}]
 
 == backing file contains non-zero data after write_zeroes ==
@@ -69,9 +69,9 @@ read 1024/1024 bytes at offset 44032
 read 3072/3072 bytes at offset 40960
 3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
-{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 36864, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 40960, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 40960, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 45056, "length": 134172672, "depth": 1, "zero": true, "data": false}]
 
 == write_zeroes covers non-zero data ==
@@ -143,13 +143,13 @@ read 1024/1024 bytes at offset 67584
 read 5120/5120 bytes at offset 68608
 5 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
-{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 32768, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 36864, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 49152, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 49152, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 53248, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
 
@@ -186,13 +186,13 @@ read 1024/1024 bytes at offset 72704
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 32768, "depth": 1, "zero": true, "data": false},
 { "start": 32768, "length": 4096, "depth": 0, "zero": true, "data": false},
-{ "start": 36864, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 36864, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 40960, "length": 8192, "depth": 1, "zero": true, "data": false},
 { "start": 49152, "length": 4096, "depth": 0, "zero": true, "data": false},
-{ "start": 53248, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 53248, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 57344, "length": 8192, "depth": 1, "zero": true, "data": false},
 { "start": 65536, "length": 4096, "depth": 0, "zero": true, "data": false},
-{ "start": 69632, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 69632, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 73728, "length": 134144000, "depth": 1, "zero": true, "data": false}]
 
 == spanning two clusters, partially overwriting backing file ==
@@ -212,7 +212,7 @@ read 1024/1024 bytes at offset 5120
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 read 2048/2048 bytes at offset 6144
 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": 20480},
+[{ "start": 0, "length": 8192, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 8192, "length": 134209536, "depth": 1, "zero": true, "data": false}]
 
 == spanning multiple clusters, non-zero in first cluster ==
@@ -227,7 +227,7 @@ read 2048/2048 bytes at offset 65536
 read 10240/10240 bytes at offset 67584
 10 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 69632, "length": 8192, "depth": 0, "zero": true, "data": false},
 { "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
 
@@ -257,7 +257,7 @@ read 2048/2048 bytes at offset 75776
 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
 { "start": 65536, "length": 8192, "depth": 0, "zero": true, "data": false},
-{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
 
 == spanning multiple clusters, partially overwriting backing file ==
@@ -278,8 +278,8 @@ read 2048/2048 bytes at offset 74752
 read 1024/1024 bytes at offset 76800
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false},
-{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 65536, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false},
-{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
 *** done
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index f58548dc44..2f595b2ce2 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -152,10 +152,12 @@ _filter_img_info()
         -e "/log_size: [0-9]\\+/d"
 }
 
-# filter out offsets and file names from qemu-img map
+# filter out offsets and file names from qemu-img map; good for both
+# human and json output
 _filter_qemu_img_map()
 {
     sed -e 's/\([0-9a-fx]* *[0-9a-fx]* *\)[0-9a-fx]* */\1/g' \
+        -e 's/"offset": [0-9]\+/"offset": OFFSET/g' \
         -e 's/Mapped to *//' | _filter_testdir | _filter_imgfmt
 }
 

From e249d5195279cfadb7f15e2ad55dd4fe1d67c105 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:49 -0500
Subject: [PATCH 18/22] iotests: Add test 179 to cover write zeroes with unmap

No tests were covering write zeroes with unmap.  Additionally,
I needed to prove that my previous patches for correct status
reporting and write zeroes optimizations actually had an impact.

The test works for cluster_size between 8k and 2M (for smaller
sizes, it fails because our allocation patterns are not contiguous
with small clusters - in part, the largest consecutive allocation
we tend to get is often bounded by the size covered by one L2
table).

Note that testing for zero clusters is tricky: 'qemu-io map'
reports whether data comes from the current layer of the image
(useful for sniffing out which regions of the file have
QCOW_OFLAG_ZERO) - but doesn't show which clusters have mappings;
while 'qemu-img map' sees "zero":true for both unallocated and
zero clusters for any qcow2 with no backing layer (so less useful
at detecting true zero clusters), but reliably shows mappings.
So we have to rely on both queries side-by-side at each point of
the test.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-10-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/179     | 130 +++++++++++++++++++++++++++++++
 tests/qemu-iotests/179.out | 156 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 287 insertions(+)
 create mode 100755 tests/qemu-iotests/179
 create mode 100644 tests/qemu-iotests/179.out

diff --git a/tests/qemu-iotests/179 b/tests/qemu-iotests/179
new file mode 100755
index 0000000000..7bc8db8fe0
--- /dev/null
+++ b/tests/qemu-iotests/179
@@ -0,0 +1,130 @@
+#!/bin/bash
+#
+# Test case for write zeroes with unmap
+#
+# Copyright (C) 2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=eblake@redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+# v2 images can't mark clusters as zero
+_unsupported_imgopts compat=0.10
+
+echo
+echo '=== Testing write zeroes with unmap ==='
+echo
+
+TEST_IMG="$TEST_IMG.base" _make_test_img 64M
+_make_test_img -b "$TEST_IMG.base"
+
+# Offsets chosen at or near 2M boundaries so test works at all cluster sizes
+# 8k and larger (smaller clusters fail due to non-contiguous allocations)
+
+# Aligned writes to unallocated cluster should not allocate mapping, but must
+# mark cluster as zero, whether or not unmap was requested
+$QEMU_IO -c "write -z -u 2M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 6M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# Unaligned writes need not allocate mapping if the cluster already reads
+# as zero, but must mark cluster as zero, whether or not unmap was requested
+$QEMU_IO -c "write -z -u 10485761 2097150" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 14680065 2097150" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# Requesting unmap of normal data must deallocate; omitting unmap should
+# preserve the mapping
+$QEMU_IO -c "write 18M 14M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 20M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 24M 6M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# Likewise when writing on already-mapped zero data
+$QEMU_IO -c "write -z -u 26M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 28M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# Writing on unmapped zeroes does not allocate
+$QEMU_IO -c "write -z 32M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 34M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 36M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# Writing zero overrides a backing file, regardless of backing cluster type
+$QEMU_IO -c "write -z 40M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write 48M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 42M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 44M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 50M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 52M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 58M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 60M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "map" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Final check that mappings are correct and images are still sane
+TEST_IMG="$TEST_IMG.base" _check_test_img
+_check_test_img
+
+echo
+echo '=== Testing cache optimization ==='
+echo
+
+BLKDBG_TEST_IMG="blkdebug:$TEST_DIR/blkdebug.conf:$TEST_IMG.base"
+
+cat > "$TEST_DIR/blkdebug.conf" <<EOF
+[inject-error]
+event = "l2_update"
+errno = "5"
+immediately = "on"
+once = "off"
+EOF
+
+# None of the following writes should trigger an L2 update, because the
+# cluster already reads as zero, and we don't have to change allocation
+$QEMU_IO -c "w -z -u 20M 2M" "$BLKDBG_TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "w -z 20M 2M" "$BLKDBG_TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "w -z 28M 2M" "$BLKDBG_TEST_IMG" | _filter_qemu_io
+
+# success, all done
+echo '*** done'
+status=0
diff --git a/tests/qemu-iotests/179.out b/tests/qemu-iotests/179.out
new file mode 100644
index 0000000000..80722b2289
--- /dev/null
+++ b/tests/qemu-iotests/179.out
@@ -0,0 +1,156 @@
+QA output created by 179
+
+=== Testing write zeroes with unmap ===
+
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2097152/2097152 bytes at offset 2097152
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 6291456
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2 MiB (0x200000) bytes not allocated at offset 0 bytes (0x0)
+2 MiB (0x200000) bytes     allocated at offset 2 MiB (0x200000)
+2 MiB (0x200000) bytes not allocated at offset 4 MiB (0x400000)
+2 MiB (0x200000) bytes     allocated at offset 6 MiB (0x600000)
+56 MiB (0x3800000) bytes not allocated at offset 8 MiB (0x800000)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": true, "data": false}]
+wrote 2097150/2097150 bytes at offset 10485761
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097150/2097150 bytes at offset 14680065
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2 MiB (0x200000) bytes not allocated at offset 0 bytes (0x0)
+2 MiB (0x200000) bytes     allocated at offset 2 MiB (0x200000)
+2 MiB (0x200000) bytes not allocated at offset 4 MiB (0x400000)
+2 MiB (0x200000) bytes     allocated at offset 6 MiB (0x600000)
+2 MiB (0x200000) bytes not allocated at offset 8 MiB (0x800000)
+2 MiB (0x200000) bytes     allocated at offset 10 MiB (0xa00000)
+2 MiB (0x200000) bytes not allocated at offset 12 MiB (0xc00000)
+2 MiB (0x200000) bytes     allocated at offset 14 MiB (0xe00000)
+48 MiB (0x3000000) bytes not allocated at offset 16 MiB (0x1000000)
+[{ "start": 0, "length": 67108864, "depth": 0, "zero": true, "data": false}]
+wrote 14680064/14680064 bytes at offset 18874368
+14 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 20971520
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 6291456/6291456 bytes at offset 25165824
+6 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2 MiB (0x200000) bytes not allocated at offset 0 bytes (0x0)
+2 MiB (0x200000) bytes     allocated at offset 2 MiB (0x200000)
+2 MiB (0x200000) bytes not allocated at offset 4 MiB (0x400000)
+2 MiB (0x200000) bytes     allocated at offset 6 MiB (0x600000)
+2 MiB (0x200000) bytes not allocated at offset 8 MiB (0x800000)
+2 MiB (0x200000) bytes     allocated at offset 10 MiB (0xa00000)
+2 MiB (0x200000) bytes not allocated at offset 12 MiB (0xc00000)
+2 MiB (0x200000) bytes     allocated at offset 14 MiB (0xe00000)
+2 MiB (0x200000) bytes not allocated at offset 16 MiB (0x1000000)
+14 MiB (0xe00000) bytes     allocated at offset 18 MiB (0x1200000)
+32 MiB (0x2000000) bytes not allocated at offset 32 MiB (0x2000000)
+[{ "start": 0, "length": 18874368, "depth": 0, "zero": true, "data": false},
+{ "start": 18874368, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 20971520, "length": 2097152, "depth": 0, "zero": true, "data": false},
+{ "start": 23068672, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 25165824, "length": 6291456, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 31457280, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 33554432, "length": 33554432, "depth": 0, "zero": true, "data": false}]
+wrote 2097152/2097152 bytes at offset 27262976
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 29360128
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2 MiB (0x200000) bytes not allocated at offset 0 bytes (0x0)
+2 MiB (0x200000) bytes     allocated at offset 2 MiB (0x200000)
+2 MiB (0x200000) bytes not allocated at offset 4 MiB (0x400000)
+2 MiB (0x200000) bytes     allocated at offset 6 MiB (0x600000)
+2 MiB (0x200000) bytes not allocated at offset 8 MiB (0x800000)
+2 MiB (0x200000) bytes     allocated at offset 10 MiB (0xa00000)
+2 MiB (0x200000) bytes not allocated at offset 12 MiB (0xc00000)
+2 MiB (0x200000) bytes     allocated at offset 14 MiB (0xe00000)
+2 MiB (0x200000) bytes not allocated at offset 16 MiB (0x1000000)
+14 MiB (0xe00000) bytes     allocated at offset 18 MiB (0x1200000)
+32 MiB (0x2000000) bytes not allocated at offset 32 MiB (0x2000000)
+[{ "start": 0, "length": 18874368, "depth": 0, "zero": true, "data": false},
+{ "start": 18874368, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 20971520, "length": 2097152, "depth": 0, "zero": true, "data": false},
+{ "start": 23068672, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 25165824, "length": 2097152, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 27262976, "length": 2097152, "depth": 0, "zero": true, "data": false},
+{ "start": 29360128, "length": 2097152, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 31457280, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 33554432, "length": 33554432, "depth": 0, "zero": true, "data": false}]
+wrote 8388608/8388608 bytes at offset 33554432
+8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 35651584
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 37748736
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2 MiB (0x200000) bytes not allocated at offset 0 bytes (0x0)
+2 MiB (0x200000) bytes     allocated at offset 2 MiB (0x200000)
+2 MiB (0x200000) bytes not allocated at offset 4 MiB (0x400000)
+2 MiB (0x200000) bytes     allocated at offset 6 MiB (0x600000)
+2 MiB (0x200000) bytes not allocated at offset 8 MiB (0x800000)
+2 MiB (0x200000) bytes     allocated at offset 10 MiB (0xa00000)
+2 MiB (0x200000) bytes not allocated at offset 12 MiB (0xc00000)
+2 MiB (0x200000) bytes     allocated at offset 14 MiB (0xe00000)
+2 MiB (0x200000) bytes not allocated at offset 16 MiB (0x1000000)
+22 MiB (0x1600000) bytes     allocated at offset 18 MiB (0x1200000)
+24 MiB (0x1800000) bytes not allocated at offset 40 MiB (0x2800000)
+[{ "start": 0, "length": 18874368, "depth": 0, "zero": true, "data": false},
+{ "start": 18874368, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 20971520, "length": 2097152, "depth": 0, "zero": true, "data": false},
+{ "start": 23068672, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 25165824, "length": 2097152, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 27262976, "length": 2097152, "depth": 0, "zero": true, "data": false},
+{ "start": 29360128, "length": 2097152, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 31457280, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 33554432, "length": 33554432, "depth": 0, "zero": true, "data": false}]
+wrote 8388608/8388608 bytes at offset 41943040
+8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 8388608/8388608 bytes at offset 50331648
+8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 44040192
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 46137344
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 52428800
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 54525952
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 60817408
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 62914560
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+42 MiB (0x2a00000) bytes not allocated at offset 0 bytes (0x0)
+4 MiB (0x400000) bytes     allocated at offset 42 MiB (0x2a00000)
+4 MiB (0x400000) bytes not allocated at offset 46 MiB (0x2e00000)
+4 MiB (0x400000) bytes     allocated at offset 50 MiB (0x3200000)
+4 MiB (0x400000) bytes not allocated at offset 54 MiB (0x3600000)
+4 MiB (0x400000) bytes     allocated at offset 58 MiB (0x3a00000)
+2 MiB (0x200000) bytes not allocated at offset 62 MiB (0x3e00000)
+[{ "start": 0, "length": 18874368, "depth": 1, "zero": true, "data": false},
+{ "start": 18874368, "length": 2097152, "depth": 1, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 20971520, "length": 2097152, "depth": 1, "zero": true, "data": false},
+{ "start": 23068672, "length": 2097152, "depth": 1, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 25165824, "length": 2097152, "depth": 1, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 27262976, "length": 2097152, "depth": 1, "zero": true, "data": false},
+{ "start": 29360128, "length": 2097152, "depth": 1, "zero": true, "data": false, "offset": OFFSET},
+{ "start": 31457280, "length": 2097152, "depth": 1, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 33554432, "length": 10485760, "depth": 1, "zero": true, "data": false},
+{ "start": 44040192, "length": 4194304, "depth": 0, "zero": true, "data": false},
+{ "start": 48234496, "length": 2097152, "depth": 1, "zero": true, "data": false},
+{ "start": 50331648, "length": 2097152, "depth": 1, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 52428800, "length": 4194304, "depth": 0, "zero": true, "data": false},
+{ "start": 56623104, "length": 2097152, "depth": 1, "zero": false, "data": true, "offset": OFFSET},
+{ "start": 58720256, "length": 2097152, "depth": 1, "zero": true, "data": false},
+{ "start": 60817408, "length": 4194304, "depth": 0, "zero": true, "data": false},
+{ "start": 65011712, "length": 2097152, "depth": 1, "zero": true, "data": false}]
+No errors were found on the image.
+No errors were found on the image.
+
+=== Testing cache optimization ===
+
+wrote 2097152/2097152 bytes at offset 20971520
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 20971520
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 29360128
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index bb6df8f2d7..5c8ea0f95c 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -171,5 +171,6 @@
 175 auto quick
 176 rw auto backing
 177 rw auto quick
+179 rw auto quick
 181 rw auto migration
 182 rw auto quick

From fbaa6bb3d3b4be71b7e234e908cb3c6bd280a222 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:50 -0500
Subject: [PATCH 19/22] qcow2: Optimize write zero of unaligned tail cluster

We've already improved discards to operate efficiently on the tail
of an unaligned qcow2 image; it's time to make a similar improvement
to write zeroes.  The special case is only valid at the tail
cluster of a file, where we must recognize that any sectors beyond
the image end would implicitly read as zero, and therefore should
not penalize our logic for widening a partial cluster into writing
the whole cluster as zero.

However, note that for now, the special case of end-of-file is only
recognized if there is no backing file, or if the backing file has
the same length; that's because when the backing file is shorter
than the active layer, we don't have code in place to recognize
that reads of a sector unallocated at the top and beyond the backing
end-of-file are implicitly zero.  It's not much of a real loss,
because most people don't use images that aren't cluster-aligned,
or where the active layer is a different size than the backing
layer (especially where the difference falls within a single cluster).

Update test 154 to cover the new scenarios, using two images of
intentionally differing length.

While at it, fix the test to gracefully skip when run as
./check -qcow2 -o compat=0.10 154
since the older format lacks zero clusters already required earlier
in the test.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-11-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c              |   7 ++
 tests/qemu-iotests/154     | 160 ++++++++++++++++++++++++++++++++++++-
 tests/qemu-iotests/154.out | 128 +++++++++++++++++++++++++++++
 3 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3fa62b0096..72b1650953 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2451,6 +2451,10 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
     BlockDriverState *file;
     int64_t res;
 
+    if (start + count > bs->total_sectors) {
+        count = bs->total_sectors - start;
+    }
+
     if (!count) {
         return true;
     }
@@ -2469,6 +2473,9 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     uint32_t tail = (offset + count) % s->cluster_size;
 
     trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
+    if (offset + count == bs->total_sectors * BDRV_SECTOR_SIZE) {
+        tail = 0;
+    }
 
     if (head || tail) {
         int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154
index 7ca7219f08..dd8a426dad 100755
--- a/tests/qemu-iotests/154
+++ b/tests/qemu-iotests/154
@@ -2,7 +2,7 @@
 #
 # qcow2 specific bdrv_pwrite_zeroes tests with backing files (complements 034)
 #
-# Copyright (C) 2016 Red Hat, Inc.
+# Copyright (C) 2016-2017 Red Hat, Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -42,7 +42,10 @@ _supported_proto file
 _supported_os Linux
 
 CLUSTER_SIZE=4k
-size=128M
+size=$((128 * 1024 * 1024))
+
+# This test requires zero clusters, added in v3 images
+_unsupported_imgopts compat=0.10
 
 echo
 echo == backing file contains zeros ==
@@ -299,6 +302,159 @@ $QEMU_IO -c "read -P 0 75k 1k" "$TEST_IMG" | _filter_qemu_io
 
 $QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
 
+echo
+echo == unaligned image tail cluster, no allocation needed ==
+
+# With no backing file, write to all or part of unallocated partial cluster
+# will mark the cluster as zero, but does not allocate.
+# Re-create the image each time to get back to unallocated clusters.
+
+# Write at the front: sector-wise, the request is: 128m... | 00 -- -- --
+_make_test_img $((size + 2048))
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at the back: sector-wise, the request is: 128m... | -- -- -- 00
+_make_test_img $((size + 2048))
+$QEMU_IO -c "write -z $((size + 1536)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at middle: sector-wise, the request is: 128m... | -- 00 00 --
+_make_test_img $((size + 2048))
+$QEMU_IO -c "write -z $((size + 512)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write entire cluster: sector-wise, the request is: 128m... | 00 00 00 00
+_make_test_img $((size + 2048))
+$QEMU_IO -c "write -z $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Repeat with backing file holding unallocated cluster.
+# TODO: Note that this forces an allocation, because we aren't yet able to
+# quickly detect that reads beyond EOF of the backing file are always zero
+CLUSTER_SIZE=2048 TEST_IMG="$TEST_IMG.base" _make_test_img $((size + 1024))
+
+# Write at the front: sector-wise, the request is:
+# backing: 128m... | -- --
+# active:  128m... | 00 -- -- --
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at the back: sector-wise, the request is:
+# backing: 128m... | -- --
+# active:  128m... | -- -- -- 00
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $((size + 1536)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at middle: sector-wise, the request is:
+# backing: 128m... | -- --
+# active:  128m... | -- 00 00 --
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $((size + 512)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write entire cluster: sector-wise, the request is:
+# backing: 128m... | -- --
+# active:  128m... | 00 00 00 00
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Repeat with backing file holding zero'd cluster
+# TODO: Note that this forces an allocation, because we aren't yet able to
+# quickly detect that reads beyond EOF of the backing file are always zero
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG.base" | _filter_qemu_io
+
+# Write at the front: sector-wise, the request is:
+# backing: 128m... | 00 00
+# active:  128m... | 00 -- -- --
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at the back: sector-wise, the request is:
+# backing: 128m... | 00 00
+# active:  128m... | -- -- -- 00
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $((size + 1536)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write at middle: sector-wise, the request is:
+# backing: 128m... | 00 00
+# active:  128m... | -- 00 00 --
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $((size + 512)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Write entire cluster: sector-wise, the request is:
+# backing: 128m... | 00 00
+# active:  128m... | 00 00 00 00
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -z $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# A preallocated cluster maintains its allocation, whether it stays as
+# data due to a partial write:
+# Convert 128m... | XX XX => ... | XX 00
+_make_test_img $((size + 1024))
+$QEMU_IO -c "write -P 1 $((size)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z $((size + 512)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 1 $((size)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size + 512)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# or because it is the entire cluster and can use the zero flag:
+# Convert 128m... | XX XX => ... | 00 00
+$QEMU_IO -c "write -z $((size)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "alloc $size 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $size 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+echo
+echo == unaligned image tail cluster, allocation required ==
+
+# Write beyond backing file must COW
+# Backing file: 128m... | XX --
+# Active layer: 128m... | -- -- 00 --
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $((size + 1024))
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -P 1 $((size)) 512" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z $((size + 1024)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 1 $((size)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size + 512)) 1536" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Writes at boundaries of (partial) cluster must not lose mid-cluster data
+# Backing file: 128m: ... | -- XX
+# Active layer: 128m: ... | 00 -- -- 00
+CLUSTER_SIZE=512 TEST_IMG="$TEST_IMG.base" _make_test_img $((size + 1024))
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+$QEMU_IO -c "write -P 1 $((size + 512)) 512" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z $((size)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 1 $((size + 512)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size + 1024)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z $((size + 1536)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 1 $((size + 512)) 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 $((size + 1024)) 1024" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index d3b68e7729..d8485eeff2 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -282,4 +282,132 @@ read 1024/1024 bytes at offset 76800
 { "start": 69632, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 73728, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
 { "start": 77824, "length": 134139904, "depth": 1, "zero": true, "data": false}]
+
+== unaligned image tail cluster, no allocation needed ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134219776, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776
+wrote 512/512 bytes at offset 134219264
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134219776, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776
+wrote 1024/1024 bytes at offset 134218240
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134219776, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776
+wrote 2048/2048 bytes at offset 134217728
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134219776, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134218752
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134219264
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 134218240
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 134217728
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134219264
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 1024/1024 bytes at offset 134218240
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 2048/2048 bytes at offset 134217728
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+2048/2048 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134218752
+wrote 1024/1024 bytes at offset 134217728
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 134218240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134218240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+1024/1024 bytes allocated at offset 128 MiB
+[{ "start": 0, "length": 134217728, "depth": 0, "zero": true, "data": false},
+{ "start": 134217728, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+wrote 1024/1024 bytes at offset 134217728
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+1024/1024 bytes allocated at offset 128 MiB
+read 1024/1024 bytes at offset 134217728
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 134217728, "depth": 0, "zero": true, "data": false},
+{ "start": 134217728, "length": 1024, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
+
+== unaligned image tail cluster, allocation required ==
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134218752
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 134218752
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 134218240
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=134218752
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
+wrote 512/512 bytes at offset 134218240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134218240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 134218752
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 134219264
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134217728
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 134218240
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 134218752
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
 *** done

From f10ee139adee1c18d399dc57a7ee22a03fa59513 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:51 -0500
Subject: [PATCH 20/22] qcow2: Assert that cluster operations are aligned

We already audited (in commit 0c1bd469) that qcow2_discard_clusters()
is only passed cluster-aligned start values; but we can further
tighten the assertion that the only unaligned end value is at EOF.

Recent commits have taken advantage of an unaligned tail cluster,
for both discard and write zeroes.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-12-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e2c5759d94..43bde569a7 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1559,11 +1559,10 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
 
     end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
 
-    /* The caller must cluster-align start; round end down except at EOF */
+    /* Caller must pass aligned values, except at image end */
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-    if (end_offset != bs->total_sectors * BDRV_SECTOR_SIZE) {
-        end_offset = start_of_cluster(s, end_offset);
-    }
+    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
 
     nb_clusters = size_to_clusters(s, end_offset - offset);
 
@@ -1646,9 +1645,17 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
+    uint64_t end_offset;
     uint64_t nb_clusters;
     int ret;
 
+    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
+
+    /* Caller must pass aligned values, except at image end */
+    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
+
     /* The zero flag is only supported by version 3 and newer */
     if (s->qcow_version < 3) {
         return -ENOTSUP;

From d2cb36af2b0040d421b347e6e4e803e07220f78d Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Sat, 6 May 2017 19:05:52 -0500
Subject: [PATCH 21/22] qcow2: Discard/zero clusters by byte count

Passing a byte offset, but sector count, when we ultimately
want to operate on cluster granularity, is madness.  Clean up
the external interfaces to take both offset and count as bytes,
while still keeping the assertion added previously that the
caller must align the values to a cluster.  Then rename things
to make sure backports don't get confused by changed units:
instead of qcow2_discard_clusters() and qcow2_zero_clusters(),
we now have qcow2_cluster_discard() and qcow2_cluster_zeroize().

The internal functions still operate on clusters at a time, and
return an int for number of cleared clusters; but on an image
with 2M clusters, a single L2 table holds 256k entries that each
represent a 2M cluster, totalling well over INT_MAX bytes if we
ever had a request for that many bytes at once.  All our callers
currently limit themselves to 32-bit bytes (and therefore fewer
clusters), but by making this function 64-bit clean, we have one
less place to clean up if we later improve the block layer to
support 64-bit bytes through all operations (with the block layer
auto-fragmenting on behalf of more-limited drivers), rather than
the current state where some interfaces are artificially limited
to INT_MAX at a time.

Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170507000552.20847-13-eblake@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  | 42 ++++++++++++++++++++++--------------------
 block/qcow2-snapshot.c |  7 +++----
 block/qcow2.c          | 22 +++++++++-------------
 block/qcow2.h          |  9 +++++----
 4 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 43bde569a7..347d94b0d2 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1549,34 +1549,36 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, enum qcow2_discard_type type,
+                          bool full_discard)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t end_offset;
+    uint64_t end_offset = offset + bytes;
     uint64_t nb_clusters;
+    int64_t cleared;
     int ret;
 
-    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
-
     /* Caller must pass aligned values, except at image end */
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
 
-    nb_clusters = size_to_clusters(s, end_offset - offset);
+    nb_clusters = size_to_clusters(s, bytes);
 
     s->cache_discards = true;
 
     /* Each L2 table is handled by its own loop iteration */
     while (nb_clusters > 0) {
-        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
-        if (ret < 0) {
+        cleared = discard_single_l2(bs, offset, nb_clusters, type,
+                                    full_discard);
+        if (cleared < 0) {
+            ret = cleared;
             goto fail;
         }
 
-        nb_clusters -= ret;
-        offset += (ret * s->cluster_size);
+        nb_clusters -= cleared;
+        offset += (cleared * s->cluster_size);
     }
 
     ret = 0;
@@ -1641,16 +1643,15 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
-                        int flags)
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t end_offset;
+    uint64_t end_offset = offset + bytes;
     uint64_t nb_clusters;
+    int64_t cleared;
     int ret;
 
-    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
-
     /* Caller must pass aligned values, except at image end */
     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
@@ -1662,18 +1663,19 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
     }
 
     /* Each L2 table is handled by its own loop iteration */
-    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+    nb_clusters = size_to_clusters(s, bytes);
 
     s->cache_discards = true;
 
     while (nb_clusters > 0) {
-        ret = zero_single_l2(bs, offset, nb_clusters, flags);
-        if (ret < 0) {
+        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
+        if (cleared < 0) {
+            ret = cleared;
             goto fail;
         }
 
-        nb_clusters -= ret;
-        offset += (ret * s->cluster_size);
+        nb_clusters -= cleared;
+        offset += (cleared * s->cluster_size);
     }
 
     ret = 0;
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 032424322a..44243e0e95 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -440,10 +440,9 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 
     /* The VM state isn't needed any more in the active L1 table; in fact, it
      * hurts by causing expensive COW for the next snapshot. */
-    qcow2_discard_clusters(bs, qcow2_vm_state_offset(s),
-                           align_offset(sn->vm_state_size, s->cluster_size)
-                                >> BDRV_SECTOR_BITS,
-                           QCOW2_DISCARD_NEVER, false);
+    qcow2_cluster_discard(bs, qcow2_vm_state_offset(s),
+                          align_offset(sn->vm_state_size, s->cluster_size),
+                          QCOW2_DISCARD_NEVER, false);
 
 #ifdef DEBUG_ALLOC
     {
diff --git a/block/qcow2.c b/block/qcow2.c
index 72b1650953..a8d61f0981 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2512,7 +2512,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
 
     /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
+    ret = qcow2_cluster_zeroize(bs, offset, count, flags);
     qemu_co_mutex_unlock(&s->lock);
 
     return ret;
@@ -2535,8 +2535,8 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
     }
 
     qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
-                                 QCOW2_DISCARD_REQUEST, false);
+    ret = qcow2_cluster_discard(bs, offset, count, QCOW2_DISCARD_REQUEST,
+                                false);
     qemu_co_mutex_unlock(&s->lock);
     return ret;
 }
@@ -2843,9 +2843,8 @@ fail:
 static int qcow2_make_empty(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
-    uint64_t start_sector;
-    int sector_step = (QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size) /
-                       BDRV_SECTOR_SIZE);
+    uint64_t offset, end_offset;
+    int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
     int l1_clusters, ret = 0;
 
     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
@@ -2862,18 +2861,15 @@ static int qcow2_make_empty(BlockDriverState *bs)
 
     /* This fallback code simply discards every active cluster; this is slow,
      * but works in all cases */
-    for (start_sector = 0; start_sector < bs->total_sectors;
-         start_sector += sector_step)
-    {
+    end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
+    for (offset = 0; offset < end_offset; offset += step) {
         /* As this function is generally used after committing an external
          * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
          * default action for this kind of discard is to pass the discard,
          * which will ideally result in an actually smaller image file, as
          * is probably desired. */
-        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
-                                     MIN(sector_step,
-                                         bs->total_sectors - start_sector),
-                                     QCOW2_DISCARD_SNAPSHOT, true);
+        ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
+                                    QCOW2_DISCARD_SNAPSHOT, true);
         if (ret < 0) {
             break;
         }
diff --git a/block/qcow2.h b/block/qcow2.h
index 810ceb84c8..1801dc30dc 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -551,10 +551,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                          int compressed_size);
 
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-    int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
-                        int flags);
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, enum qcow2_discard_type type,
+                          bool full_discard);
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, int flags);
 
 int qcow2_expand_zero_clusters(BlockDriverState *bs,
                                BlockDriverAmendStatusCB *status_cb,

From 8dd30c86dd41a727f726a086fff97c7ce06cbe1d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 28 Apr 2017 18:55:17 +0200
Subject: [PATCH 22/22] MAINTAINERS: Add qemu-progress to the block layer

util/qemu-progress.c is currently unmaintained. The only user of its
functionality is qemu-img, so it effectively is part of the block layer.

Suggested-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170428165517.30341-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8224be0d75..145be0f45f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1170,6 +1170,7 @@ F: include/block/
 F: qemu-img*
 F: qemu-io*
 F: tests/qemu-iotests/
+F: util/qemu-progress.c
 T: git git://repo.or.cz/qemu/kevin.git block
 
 Block I/O path