From 142c6b1a89c3af769fbab6a22f51eefa7a3b0330 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Thu, 21 Mar 2013 13:07:10 +0100
Subject: [PATCH 01/23] vl.c: call bdrv_init_with_whitelist() before cmdline
 parsing

commit 4d454574 "qemu-option: move standard option definitions
out of qemu-config.c" broke support for commandline option
groups that where registered during bdrv_init(). In particular
support for -iscsi options was broken since that commit.

Fix by moving the bdrv_init_with_whitelist() before command
line argument parsing.

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 vl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vl.c b/vl.c
index 7643f16351..7f86a40f6f 100644
--- a/vl.c
+++ b/vl.c
@@ -2941,6 +2941,8 @@ int main(int argc, char **argv, char **envp)
     nb_numa_nodes = 0;
     nb_nics = 0;
 
+    bdrv_init_with_whitelist();
+
     autostart= 1;
 
     /* first pass of option parsing */
@@ -4199,8 +4201,6 @@ int main(int argc, char **argv, char **envp)
 
     cpu_exec_init_all();
 
-    bdrv_init_with_whitelist();
-
     blk_mig_init();
 
     /* open the virtual block devices */

From 6f74928192e8e8a16f64b6208171eb13af890bbc Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:53 +0100
Subject: [PATCH 02/23] qemu-iotests: More concurrent allocation scenarios

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/qemu-iotests/046     | 49 +++++++++++++++++++++++-
 tests/qemu-iotests/046.out | 76 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046
index e0176f42df..987bfff8fa 100755
--- a/tests/qemu-iotests/046
+++ b/tests/qemu-iotests/046
@@ -66,7 +66,7 @@ function backing_io()
     done
 }
 
-backing_io 0 16 write | $QEMU_IO $TEST_IMG | _filter_qemu_io
+backing_io 0 32 write | $QEMU_IO $TEST_IMG | _filter_qemu_io
 
 mv $TEST_IMG $TEST_IMG.base
 
@@ -153,6 +153,36 @@ aio_write -P 101 0xaa000 0xe000
 resume A
 aio_flush
 EOF
+
+# Reverse sequential write
+cat  <<EOF
+break write_aio A
+aio_write -P 121 0xdc000 0x2000
+wait_break A
+aio_write -P 120 0xc4000 0x18000
+resume A
+aio_flush
+EOF
+
+# Reverse sequential write with a gap
+cat  <<EOF
+break write_aio A
+aio_write -P 141 0xfc000 0x2000
+wait_break A
+aio_write -P 140 0xe4000 0x14000
+resume A
+aio_flush
+EOF
+
+# Allocate an area in the middle and then overwrite with a larger request
+cat  <<EOF
+break write_aio A
+aio_write -P 161 0x10c000 0x8000
+wait_break A
+aio_write -P 160 0x104000 0x18000
+resume A
+aio_flush
+EOF
 }
 
 overlay_io | $QEMU_IO blkdebug::$TEST_IMG | _filter_qemu_io |\
@@ -203,6 +233,23 @@ function verify_io()
     echo read -P 10  0xa8000 0x2000
     echo read -P 101 0xaa000 0xe000
     echo read -P 110 0xb8000 0x8000
+
+    echo read -P 12  0xc0000 0x4000
+    echo read -P 120 0xc4000 0x18000
+    echo read -P 121 0xdc000 0x2000
+    echo read -P 13  0xde000 0x2000
+
+    echo read -P 14  0xe0000 0x4000
+    echo read -P 140 0xe4000 0x14000
+    echo read -P 15  0xf8000 0x4000
+    echo read -P 141 0xfc000 0x2000
+    echo read -P 15  0xfe000 0x2000
+
+    echo read -P 16  0x100000 0x4000
+    echo read -P 160 0x104000 0x8000
+    # Undefined content for 0x10c000 0x8000
+    echo read -P 160 0x114000 0x8000
+    echo read -P 17  0x11c000 0x4000
 }
 
 verify_io | $QEMU_IO $TEST_IMG | _filter_qemu_io
diff --git a/tests/qemu-iotests/046.out b/tests/qemu-iotests/046.out
index 565360fe60..4b50a17eeb 100644
--- a/tests/qemu-iotests/046.out
+++ b/tests/qemu-iotests/046.out
@@ -34,6 +34,38 @@ qemu-io> wrote 65536/65536 bytes at offset 917504
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-io> wrote 65536/65536 bytes at offset 983040
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1048576
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1114112
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1179648
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1245184
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1310720
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1376256
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1441792
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1507328
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1572864
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1638400
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1703936
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1769472
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1835008
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1900544
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 1966080
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> wrote 65536/65536 bytes at offset 2031616
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-io> Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=6442450944 backing_file='TEST_DIR/t.IMGFMT.base' 
 
 == Some concurrent requests touching the same cluster ==
@@ -89,6 +121,24 @@ qemu-io> wrote 8192/8192 bytes at offset XXX
 8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 57344/57344 bytes at offset XXX
 56 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> qemu-io> blkdebug: Suspended request 'A'
+qemu-io> qemu-io> qemu-io> blkdebug: Resuming request 'A'
+qemu-io> wrote 8192/8192 bytes at offset XXX
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 98304/98304 bytes at offset XXX
+96 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> qemu-io> blkdebug: Suspended request 'A'
+qemu-io> qemu-io> qemu-io> blkdebug: Resuming request 'A'
+qemu-io> wrote 8192/8192 bytes at offset XXX
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 81920/81920 bytes at offset XXX
+80 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> qemu-io> blkdebug: Suspended request 'A'
+qemu-io> qemu-io> qemu-io> blkdebug: Resuming request 'A'
+qemu-io> wrote 32768/32768 bytes at offset XXX
+32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 98304/98304 bytes at offset XXX
+96 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-io> 
 == Verify image content ==
 qemu-io> read 65536/65536 bytes at offset 0
@@ -159,5 +209,31 @@ qemu-io> read 57344/57344 bytes at offset 696320
 56 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-io> read 32768/32768 bytes at offset 753664
 32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 16384/16384 bytes at offset 786432
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 98304/98304 bytes at offset 802816
+96 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 8192/8192 bytes at offset 901120
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 8192/8192 bytes at offset 909312
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 16384/16384 bytes at offset 917504
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 81920/81920 bytes at offset 933888
+80 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 16384/16384 bytes at offset 1015808
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 8192/8192 bytes at offset 1032192
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 8192/8192 bytes at offset 1040384
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 16384/16384 bytes at offset 1048576
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 32768/32768 bytes at offset 1064960
+32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 32768/32768 bytes at offset 1130496
+32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+qemu-io> read 16384/16384 bytes at offset 1163264
+16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-io> No errors were found on the image.
 *** done

From c349ca4bb2dbca53c15147d283ea9f6c94376c6c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:54 +0100
Subject: [PATCH 03/23] qcow2: Fix "total clusters" number in bdrv_check

This should be based on the virtual disk size, not on the size of the
image.

Interesting observation: With some VM state stored in the image file,
percentages higher than 100% are possible, even though snapshots
themselves are ignored. This is a qcow2 bug to be fixed another day: The
VM state should be discarded in the active L2 tables after completing
the snapshot creation.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-refcount.c     | 4 +++-
 tests/qemu-iotests/044.out | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 9bfb390519..c38e970bf2 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1152,9 +1152,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 
     size = bdrv_getlength(bs->file);
     nb_clusters = size_to_clusters(s, size);
-    res->bfi.total_clusters = nb_clusters;
     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
 
+    res->bfi.total_clusters =
+        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
+
     /* header */
     inc_refcounts(bs, res, refcount_table, nb_clusters,
         0, s->cluster_size);
diff --git a/tests/qemu-iotests/044.out b/tests/qemu-iotests/044.out
index 5eed3f87a3..34c25c793e 100644
--- a/tests/qemu-iotests/044.out
+++ b/tests/qemu-iotests/044.out
@@ -1,5 +1,5 @@
 No errors were found on the image.
-7292415/8391499= 86.90% allocated, 0.00% fragmented, 0.00% compressed clusters
+7292415/33554432 = 21.73% allocated, 0.00% fragmented, 0.00% compressed clusters
 Image end offset: 4296447488
 .
 ----------------------------------------------------------------------

From 9ee6439e27d15c528fde6d9da1e4c238a23b6b7a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:55 +0100
Subject: [PATCH 04/23] qcow2: Remove bogus unlock of s->lock

The unlock wakes up the next coroutine, but the currently running
coroutine will lock it again before it yields, so this doesn't make a
lot of sense.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 8ea696a1aa..3f7edf5652 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -869,9 +869,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 QLIST_REMOVE(l2meta, next_in_flight);
             }
 
-            qemu_co_mutex_unlock(&s->lock);
             qemu_co_queue_restart_all(&l2meta->dependent_requests);
-            qemu_co_mutex_lock(&s->lock);
 
             g_free(l2meta);
             l2meta = NULL;

From 17a71e58238138c3f02be7e9f5dc8de5d72a9a9d Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:56 +0100
Subject: [PATCH 05/23] qcow2: Handle dependencies earlier

Handling overlapping allocations isn't just a detail of cluster
allocation. It is rather one of three ways to get the host cluster
offset for a write request:

1. If a request overlaps an in-flight allocations, the cluster offset
   can be taken from there (this is what handle_dependencies will evolve
   into) or the request must just wait until the allocation has
   completed. Accessing the L2 is not valid in this case, it has
   outdated information.

2. Outside overlapping areas, check the clusters that can be written to
   as they are, with no COW involved.

3. If a COW is required, allocate new clusters

Changing the code to reflect this doesn't change the behaviour because
overlaps cannot exist for clusters that are kept in step 2. It does
however make it easier for later patches to work on clusters that belong
to an allocation that is still in flight.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 59 +++++++++++++++++++++++++++++++------------
 block/qcow2.h         |  5 ++++
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d72d063e6d..71e027adc9 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -824,16 +824,10 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
     uint64_t *host_offset, unsigned int *nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret;
 
     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                          *host_offset, *nb_clusters);
 
-    ret = handle_dependencies(bs, guest_offset, nb_clusters);
-    if (ret < 0) {
-        return ret;
-    }
-
     /* Allocate new clusters */
     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
     if (*host_offset == 0) {
@@ -845,7 +839,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
         *host_offset = cluster_offset;
         return 0;
     } else {
-        ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
         if (ret < 0) {
             return ret;
         }
@@ -885,20 +879,55 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
 
-    /* Find L2 entry for the first involved cluster */
 again:
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
-    if (ret < 0) {
-        return ret;
-    }
-
     /*
      * Calculate the number of clusters to look for. We stop at L2 table
      * boundaries to keep things simple.
      */
+    l2_index = offset_to_l2_index(s, offset);
     nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
                       s->l2_size - l2_index);
 
+    /*
+     * Now start gathering as many contiguous clusters as possible:
+     *
+     * 1. Check for overlaps with in-flight allocations
+     *
+     *      a) Overlap not in the first cluster -> shorten this request and let
+     *         the caller handle the rest in its next loop iteration.
+     *
+     *      b) Real overlaps of two requests. Yield and restart the search for
+     *         contiguous clusters (the situation could have changed while we
+     *         were sleeping)
+     *
+     *      c) TODO: Request starts in the same cluster as the in-flight
+     *         allocation ends. Shorten the COW of the in-fight allocation, set
+     *         cluster_offset to write to the same cluster and set up the right
+     *         synchronisation between the in-flight request and the new one.
+     *
+     * 2. Count contiguous COPIED clusters.
+     *    TODO: Consider cluster_offset if set in step 1c.
+     *
+     * 3. If the request still hasn't completed, allocate new clusters,
+     *    considering any cluster_offset of steps 1c or 2.
+     */
+    ret = handle_dependencies(bs, offset, &nb_clusters);
+    if (ret == -EAGAIN) {
+        goto again;
+    } else if (ret < 0) {
+        return ret;
+    } else {
+        /* handle_dependencies() may have decreased cur_bytes (shortened
+         * the allocations below) so that the next dependency is processed
+         * correctly during the next loop iteration. */
+    }
+
+    /* Find L2 entry for the first involved cluster */
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+    if (ret < 0) {
+        return ret;
+    }
+
     cluster_offset = be64_to_cpu(l2_table[l2_index]);
 
     /*
@@ -963,9 +992,7 @@ again:
         /* Allocate, if necessary at a given offset in the image file */
         ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
                                       &nb_clusters);
-        if (ret == -EAGAIN) {
-            goto again;
-        } else if (ret < 0) {
+        if (ret < 0) {
             goto fail;
         }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index e4b5e11a91..0940b1b3e8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -277,6 +277,11 @@ static inline int size_to_l1(BDRVQcowState *s, int64_t size)
     return (size + (1ULL << shift) - 1) >> shift;
 }
 
+static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset)
+{
+    return (offset >> s->cluster_bits) & (s->l2_size - 1);
+}
+
 static inline int64_t align_offset(int64_t offset, int n)
 {
     offset = (offset + n - 1) & ~(n - 1);

From d9d74f4177af59bec23baa480d640709f56df0aa Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:57 +0100
Subject: [PATCH 06/23] qcow2: Improve check for overlapping allocations

The old code detected an overlapping allocation even when the
allocations didn't actually overlap, but were only adjacent.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c      |   2 +-
 tests/qemu-iotests/038.out | 238 ++++++++++++++++++-------------------
 2 files changed, 120 insertions(+), 120 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 71e027adc9..7f4f73eaca 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -773,7 +773,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
         uint64_t old_start = old_alloc->offset >> s->cluster_bits;
         uint64_t old_end = old_start + old_alloc->nb_clusters;
 
-        if (end < old_start || start > old_end) {
+        if (end <= old_start || start >= old_end) {
             /* No intersection */
         } else {
             if (start < old_start) {
diff --git a/tests/qemu-iotests/038.out b/tests/qemu-iotests/038.out
index acc7629267..9cd0cd8771 100644
--- a/tests/qemu-iotests/038.out
+++ b/tests/qemu-iotests/038.out
@@ -517,7 +517,67 @@ qemu-io> wrote 65536/65536 bytes at offset 16711680
 qemu-io> Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=6442450944 backing_file='TEST_DIR/t.IMGFMT.base' 
 
 == Some concurrent requests touching the same cluster ==
-qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> wrote 81920/81920 bytes at offset XXX
+qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> qemu-io> wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 81920/81920 bytes at offset XXX
 80 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 65536/65536 bytes at offset XXX
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
@@ -585,126 +645,66 @@ wrote 65536/65536 bytes at offset XXX
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 65536/65536 bytes at offset XXX
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 81920/81920 bytes at offset XXX
 80 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset XXX
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 == Verify image content ==
 qemu-io> read 4096/4096 bytes at offset 2064384

From 65eb2e35c07632eb5d26f15a57461e321bacb883 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:58 +0100
Subject: [PATCH 07/23] qcow2: Change handle_dependency to byte granularity

This is a more precise description of what really constitutes a
dependency. The behaviour doesn't change at this point because the COW
area of the old request is still aligned to cluster boundaries and
therefore an overlap is detected wheneven the requests touch any part of
the same cluster.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 40 ++++++++++++++++++++++++++++------------
 block/qcow2.h         | 11 +++++++++++
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 7f4f73eaca..202adb4807 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -759,31 +759,41 @@ out:
  * Check if there already is an AIO write request in flight which allocates
  * the same cluster. In this case we need to wait until the previous
  * request has completed and updated the L2 table accordingly.
+ *
+ * Returns:
+ *   0       if there was no dependency. *cur_bytes indicates the number of
+ *           bytes from guest_offset that can be read before the next
+ *           dependency must be processed (or the request is complete)
+ *
+ *   -EAGAIN if we had to wait for another request, previously gathered
+ *           information on cluster allocation may be invalid now. The caller
+ *           must start over anyway, so consider *cur_bytes undefined.
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-    unsigned int *nb_clusters)
+    uint64_t *cur_bytes)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
+    uint64_t bytes = *cur_bytes;
 
     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 
-        uint64_t start = guest_offset >> s->cluster_bits;
-        uint64_t end = start + *nb_clusters;
-        uint64_t old_start = old_alloc->offset >> s->cluster_bits;
-        uint64_t old_end = old_start + old_alloc->nb_clusters;
+        uint64_t start = guest_offset;
+        uint64_t end = start + bytes;
+        uint64_t old_start = l2meta_cow_start(old_alloc);
+        uint64_t old_end = l2meta_cow_end(old_alloc);
 
         if (end <= old_start || start >= old_end) {
             /* No intersection */
         } else {
             if (start < old_start) {
                 /* Stop at the start of a running allocation */
-                *nb_clusters = old_start - start;
+                bytes = old_start - start;
             } else {
-                *nb_clusters = 0;
+                bytes = 0;
             }
 
-            if (*nb_clusters == 0) {
+            if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
                 qemu_co_mutex_unlock(&s->lock);
@@ -794,9 +804,9 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
         }
     }
 
-    if (!*nb_clusters) {
-        abort();
-    }
+    /* Make sure that existing clusters and new allocations are only used up to
+     * the next dependency if we shortened the request above */
+    *cur_bytes = bytes;
 
     return 0;
 }
@@ -875,6 +885,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
     uint64_t cluster_offset;
+    uint64_t cur_bytes;
 
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
@@ -887,6 +898,7 @@ again:
     l2_index = offset_to_l2_index(s, offset);
     nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
                       s->l2_size - l2_index);
+    n_end = MIN(n_end, nb_clusters * s->cluster_sectors);
 
     /*
      * Now start gathering as many contiguous clusters as possible:
@@ -911,7 +923,8 @@ again:
      * 3. If the request still hasn't completed, allocate new clusters,
      *    considering any cluster_offset of steps 1c or 2.
      */
-    ret = handle_dependencies(bs, offset, &nb_clusters);
+    cur_bytes = (n_end - n_start) * BDRV_SECTOR_SIZE;
+    ret = handle_dependencies(bs, offset, &cur_bytes);
     if (ret == -EAGAIN) {
         goto again;
     } else if (ret < 0) {
@@ -922,6 +935,9 @@ again:
          * correctly during the next loop iteration. */
     }
 
+    nb_clusters = size_to_clusters(s, offset + cur_bytes)
+                - (offset >> s->cluster_bits);
+
     /* Find L2 entry for the first involved cluster */
     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
diff --git a/block/qcow2.h b/block/qcow2.h
index 0940b1b3e8..a99d51b6a1 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -307,6 +307,17 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
     return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
 }
 
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+    return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+    return m->offset + m->cow_end.offset
+        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+}
+
 // FIXME Need qcow2_ prefix to global functions
 
 /* qcow2.c functions */

From 037689d8969c493d39153fd920ad81e161b0d55c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:49:59 +0100
Subject: [PATCH 08/23] qcow2: Decouple cluster allocation from cluster reuse
 code

This moves some code that prepares the allocation of new clusters to
where the actual allocation happens. This is the minimum required to be
able to move it to a separate function in the next patch.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 202adb4807..9550927c4d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -946,10 +946,7 @@ again:
 
     cluster_offset = be64_to_cpu(l2_table[l2_index]);
 
-    /*
-     * Check how many clusters are already allocated and don't need COW, and how
-     * many need a new allocation.
-     */
+    /* Check how many clusters are already allocated and don't need COW */
     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
         && (cluster_offset & QCOW_OFLAG_COPIED))
     {
@@ -965,17 +962,6 @@ again:
         cluster_offset = 0;
     }
 
-    if (nb_clusters > 0) {
-        /* For the moment, overwrite compressed clusters one by one */
-        uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
-        if (entry & QCOW_OFLAG_COMPRESSED) {
-            nb_clusters = 1;
-        } else {
-            nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
-                                             l2_index + keep_clusters);
-        }
-    }
-
     cluster_offset &= L2E_OFFSET_MASK;
 
     /*
@@ -996,6 +982,25 @@ again:
         uint64_t alloc_cluster_offset;
         uint64_t keep_bytes = keep_clusters * s->cluster_size;
 
+        ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* For the moment, overwrite compressed clusters one by one */
+        uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
+        if (entry & QCOW_OFLAG_COMPRESSED) {
+            nb_clusters = 1;
+        } else {
+            nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
+                                             l2_index + keep_clusters);
+        }
+
+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+        if (ret < 0) {
+            return ret;
+        }
+
         /* Calculate start and size of allocation */
         alloc_offset = offset + keep_bytes;
 

From 10f0ed8b2f0d3e9f0476b6f00868dd13b524066a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:00 +0100
Subject: [PATCH 09/23] qcow2: Factor out handle_alloc()

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 242 ++++++++++++++++++++++++++----------------
 trace-events          |   1 +
 2 files changed, 153 insertions(+), 90 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 9550927c4d..454a30cab0 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -858,6 +858,146 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
     }
 }
 
+/*
+ * Allocates new clusters for an area that either is yet unallocated or needs a
+ * copy on write. If *host_offset is non-zero, clusters are only allocated if
+ * the new allocation can match the specified host offset.
+ *
+ * Note that guest_offset may not be cluster aligned.
+ *
+ * Returns:
+ *   0:     if no clusters could be allocated. *bytes is set to 0,
+ *          *host_offset is left unchanged.
+ *
+ *   1:     if new clusters were allocated. *bytes may be decreased if the
+ *          new allocation doesn't cover all of the requested area.
+ *          *host_offset is updated to contain the host offset of the first
+ *          newly allocated cluster.
+ *
+ *  -errno: in error cases
+ *
+ * TODO Get rid of nb_clusters, keep_clusters, n_start, n_end
+ * TODO Make *bytes actually behave as specified above
+ */
+static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
+    unsigned int nb_clusters, int keep_clusters, int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index;
+    uint64_t *l2_table;
+    uint64_t entry;
+    int ret;
+
+    uint64_t alloc_offset;
+    uint64_t alloc_cluster_offset;
+    uint64_t keep_bytes = keep_clusters * s->cluster_size;
+
+    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
+                             *bytes);
+    assert(*bytes > 0);
+
+    /* Find L2 entry for the first involved cluster */
+    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    if (ret < 0) {
+        return ret;
+    }
+
+    entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
+
+    /* For the moment, overwrite compressed clusters one by one */
+    if (entry & QCOW_OFLAG_COMPRESSED) {
+        nb_clusters = 1;
+    } else {
+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
+                                         l2_index + keep_clusters);
+    }
+
+    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (nb_clusters == 0) {
+        *bytes = 0;
+        return 0;
+    }
+
+    /* Calculate start and size of allocation */
+    alloc_offset = guest_offset + keep_bytes;
+
+    if (keep_clusters == 0) {
+        alloc_cluster_offset = 0;
+    } else {
+        alloc_cluster_offset = *host_offset + keep_bytes;
+    }
+
+    /* Allocate, if necessary at a given offset in the image file */
+    ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
+                                  &nb_clusters);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* save info needed for meta data update */
+    if (nb_clusters > 0) {
+        /*
+         * requested_sectors: Number of sectors from the start of the first
+         * newly allocated cluster to the end of the (possibly shortened
+         * before) write request.
+         *
+         * avail_sectors: Number of sectors from the start of the first
+         * newly allocated to the end of the last newly allocated cluster.
+         *
+         * nb_sectors: The number of sectors from the start of the first
+         * newly allocated cluster to the end of the aread that the write
+         * request actually writes to (excluding COW at the end)
+         */
+        int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
+        int avail_sectors = nb_clusters
+                            << (s->cluster_bits - BDRV_SECTOR_BITS);
+        int alloc_n_start = keep_clusters == 0 ? n_start : 0;
+        int nb_sectors = MIN(requested_sectors, avail_sectors);
+
+        if (keep_clusters == 0) {
+            *host_offset = alloc_cluster_offset;
+        }
+
+        *m = g_malloc0(sizeof(**m));
+
+        **m = (QCowL2Meta) {
+            .alloc_offset   = alloc_cluster_offset,
+            .offset         = alloc_offset & ~(s->cluster_size - 1),
+            .nb_clusters    = nb_clusters,
+            .nb_available   = nb_sectors,
+
+            .cow_start = {
+                .offset     = 0,
+                .nb_sectors = alloc_n_start,
+            },
+            .cow_end = {
+                .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+                .nb_sectors = avail_sectors - nb_sectors,
+            },
+        };
+        qemu_co_queue_init(&(*m)->dependent_requests);
+        QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+        *bytes = nb_clusters * s->cluster_size;
+    } else {
+        *bytes = 0;
+        return 0;
+    }
+
+    return 1;
+
+fail:
+    if (*m && (*m)->nb_clusters > 0) {
+        QLIST_REMOVE(*m, next_in_flight);
+    }
+    return ret;
+}
+
 /*
  * alloc_cluster_offset
  *
@@ -977,93 +1117,21 @@ again:
     }
 
     /* If there is something left to allocate, do that now */
-    if (nb_clusters > 0) {
-        uint64_t alloc_offset;
-        uint64_t alloc_cluster_offset;
-        uint64_t keep_bytes = keep_clusters * s->cluster_size;
-
-        ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
-        if (ret < 0) {
-            return ret;
-        }
-
-        /* For the moment, overwrite compressed clusters one by one */
-        uint64_t entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
-        if (entry & QCOW_OFLAG_COMPRESSED) {
-            nb_clusters = 1;
-        } else {
-            nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
-                                             l2_index + keep_clusters);
-        }
-
-        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-        if (ret < 0) {
-            return ret;
-        }
-
-        /* Calculate start and size of allocation */
-        alloc_offset = offset + keep_bytes;
-
-        if (keep_clusters == 0) {
-            alloc_cluster_offset = 0;
-        } else {
-            alloc_cluster_offset = cluster_offset + keep_bytes;
-        }
-
-        /* Allocate, if necessary at a given offset in the image file */
-        ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
-                                      &nb_clusters);
-        if (ret < 0) {
-            goto fail;
-        }
-
-        /* save info needed for meta data update */
-        if (nb_clusters > 0) {
-            /*
-             * requested_sectors: Number of sectors from the start of the first
-             * newly allocated cluster to the end of the (possibly shortened
-             * before) write request.
-             *
-             * avail_sectors: Number of sectors from the start of the first
-             * newly allocated to the end of the last newly allocated cluster.
-             *
-             * nb_sectors: The number of sectors from the start of the first
-             * newly allocated cluster to the end of the aread that the write
-             * request actually writes to (excluding COW at the end)
-             */
-            int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
-            int avail_sectors = nb_clusters
-                                << (s->cluster_bits - BDRV_SECTOR_BITS);
-            int alloc_n_start = keep_clusters == 0 ? n_start : 0;
-            int nb_sectors = MIN(requested_sectors, avail_sectors);
-
-            if (keep_clusters == 0) {
-                cluster_offset = alloc_cluster_offset;
-            }
-
-            *m = g_malloc0(sizeof(**m));
-
-            **m = (QCowL2Meta) {
-                .alloc_offset   = alloc_cluster_offset,
-                .offset         = alloc_offset & ~(s->cluster_size - 1),
-                .nb_clusters    = nb_clusters,
-                .nb_available   = nb_sectors,
-
-                .cow_start = {
-                    .offset     = 0,
-                    .nb_sectors = alloc_n_start,
-                },
-                .cow_end = {
-                    .offset     = nb_sectors * BDRV_SECTOR_SIZE,
-                    .nb_sectors = avail_sectors - nb_sectors,
-                },
-            };
-            qemu_co_queue_init(&(*m)->dependent_requests);
-            QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
-        }
+    if (nb_clusters == 0) {
+        goto done;
     }
 
+    cur_bytes = nb_clusters * s->cluster_size;
+    ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m,
+                       nb_clusters, keep_clusters, n_start, n_end);
+    if (ret < 0) {
+        return ret;
+    }
+
+    nb_clusters = size_to_clusters(s, cur_bytes);
+
     /* Some cleanup work */
+done:
     sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
     if (sectors > n_end) {
         sectors = n_end;
@@ -1074,12 +1142,6 @@ again:
     *host_offset = cluster_offset;
 
     return 0;
-
-fail:
-    if (*m && (*m)->nb_clusters > 0) {
-        QLIST_REMOVE(*m, next_in_flight);
-    }
-    return ret;
 }
 
 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
diff --git a/trace-events b/trace-events
index 406fe5f408..9511a28ca7 100644
--- a/trace-events
+++ b/trace-events
@@ -483,6 +483,7 @@ qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
 
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int n_start, int n_end) "co %p offet %" PRIx64 " n_start %d n_end %d"
+qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offet %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
 qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offet %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
 qcow2_cluster_alloc_phys(void *co) "co %p"
 qcow2_cluster_link_l2(void *co, int nb_clusters) "co %p nb_clusters %d"

From f5bc63509471299176066d5f63bb8ff2e15af279 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:01 +0100
Subject: [PATCH 10/23] qcow2: handle_alloc(): Get rid of nb_clusters parameter

We already communicate the same information in *bytes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 454a30cab0..009f62a552 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -876,17 +876,18 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  *
  *  -errno: in error cases
  *
- * TODO Get rid of nb_clusters, keep_clusters, n_start, n_end
+ * TODO Get rid of keep_clusters, n_start, n_end
  * TODO Make *bytes actually behave as specified above
  */
 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
-    unsigned int nb_clusters, int keep_clusters, int n_start, int n_end)
+    int keep_clusters, int n_start, int n_end)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index;
     uint64_t *l2_table;
     uint64_t entry;
+    unsigned int nb_clusters;
     int ret;
 
     uint64_t alloc_offset;
@@ -897,6 +898,13 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
                              *bytes);
     assert(*bytes > 0);
 
+    /*
+     * Calculate the number of clusters to look for. We stop at L2 table
+     * boundaries to keep things simple.
+     */
+    l2_index = offset_to_l2_index(s, guest_offset);
+    nb_clusters = MIN(size_to_clusters(s, *bytes), s->l2_size - l2_index);
+
     /* Find L2 entry for the first involved cluster */
     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
     if (ret < 0) {
@@ -1103,6 +1111,7 @@ again:
     }
 
     cluster_offset &= L2E_OFFSET_MASK;
+    *host_offset = cluster_offset;
 
     /*
      * The L2 table isn't used any more after this. As long as the cache works
@@ -1123,11 +1132,14 @@ again:
 
     cur_bytes = nb_clusters * s->cluster_size;
     ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m,
-                       nb_clusters, keep_clusters, n_start, n_end);
+                       keep_clusters, n_start, n_end);
     if (ret < 0) {
         return ret;
     }
 
+    if (!*host_offset) {
+        *host_offset = cluster_offset;
+    }
     nb_clusters = size_to_clusters(s, cur_bytes);
 
     /* Some cleanup work */
@@ -1139,7 +1151,6 @@ done:
 
     assert(sectors > n_start);
     *num = sectors - n_start;
-    *host_offset = cluster_offset;
 
     return 0;
 }

From 3b8e2e260c8cee63c9253718983a6682dc2771d7 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:02 +0100
Subject: [PATCH 11/23] qcow2: handle_alloc(): Get rid of keep_clusters
 parameter

handle_alloc() is now called with the offset at which the actual new
allocation starts instead of the offset at which the whole write request
starts, part of which may already be processed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 44 ++++++++++++++++++++++++++-----------------
 block/qcow2.h         |  5 +++++
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 009f62a552..8f4ef0d48a 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -876,12 +876,12 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  *
  *  -errno: in error cases
  *
- * TODO Get rid of keep_clusters, n_start, n_end
+ * TODO Get rid of n_start, n_end
  * TODO Make *bytes actually behave as specified above
  */
 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
-    int keep_clusters, int n_start, int n_end)
+    int n_start, int n_end)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index;
@@ -892,7 +892,6 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
 
     uint64_t alloc_offset;
     uint64_t alloc_cluster_offset;
-    uint64_t keep_bytes = keep_clusters * s->cluster_size;
 
     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);
@@ -911,14 +910,13 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         return ret;
     }
 
-    entry = be64_to_cpu(l2_table[l2_index + keep_clusters]);
+    entry = be64_to_cpu(l2_table[l2_index]);
 
     /* For the moment, overwrite compressed clusters one by one */
     if (entry & QCOW_OFLAG_COMPRESSED) {
         nb_clusters = 1;
     } else {
-        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table,
-                                         l2_index + keep_clusters);
+        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
     }
 
     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
@@ -932,13 +930,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     }
 
     /* Calculate start and size of allocation */
-    alloc_offset = guest_offset + keep_bytes;
-
-    if (keep_clusters == 0) {
-        alloc_cluster_offset = 0;
-    } else {
-        alloc_cluster_offset = *host_offset + keep_bytes;
-    }
+    alloc_offset = guest_offset;
+    alloc_cluster_offset = *host_offset;
 
     /* Allocate, if necessary at a given offset in the image file */
     ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
@@ -961,13 +954,13 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
          * newly allocated cluster to the end of the aread that the write
          * request actually writes to (excluding COW at the end)
          */
-        int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
+        int requested_sectors = n_end;
         int avail_sectors = nb_clusters
                             << (s->cluster_bits - BDRV_SECTOR_BITS);
-        int alloc_n_start = keep_clusters == 0 ? n_start : 0;
+        int alloc_n_start = *host_offset == 0 ? n_start : 0;
         int nb_sectors = MIN(requested_sectors, avail_sectors);
 
-        if (keep_clusters == 0) {
+        if (*host_offset == 0) {
             *host_offset = alloc_cluster_offset;
         }
 
@@ -1130,9 +1123,26 @@ again:
         goto done;
     }
 
+    int alloc_n_start;
+    int alloc_n_end;
+
+    if (keep_clusters != 0) {
+        offset         = start_of_cluster(s, offset
+                                             + keep_clusters * s->cluster_size);
+        cluster_offset = start_of_cluster(s, cluster_offset
+                                             + keep_clusters * s->cluster_size);
+
+        alloc_n_start = 0;
+        alloc_n_end = n_end - keep_clusters * s->cluster_sectors;
+    } else {
+        alloc_n_start = n_start;
+        alloc_n_end = n_end;
+    }
+
     cur_bytes = nb_clusters * s->cluster_size;
+
     ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m,
-                       keep_clusters, n_start, n_end);
+                       alloc_n_start, alloc_n_end);
     if (ret < 0) {
         return ret;
     }
diff --git a/block/qcow2.h b/block/qcow2.h
index a99d51b6a1..c4eaf670ec 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -266,6 +266,11 @@ enum {
 
 #define REFT_OFFSET_MASK 0xffffffffffffff00ULL
 
+static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & ~(s->cluster_size - 1);
+}
+
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
 {
     return (size + (s->cluster_size - 1)) >> s->cluster_bits;

From c37f4cd71d99b7658d238bd8399048fc6e506958 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:03 +0100
Subject: [PATCH 12/23] qcow2: Finalise interface of handle_alloc()

The interface works completely on a byte granularity now and duplicated
parameters are removed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 29 ++++++++++++++++-------------
 block/qcow2.h         |  5 +++++
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8f4ef0d48a..8ed1f7dd2a 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -875,13 +875,9 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  *          newly allocated cluster.
  *
  *  -errno: in error cases
- *
- * TODO Get rid of n_start, n_end
- * TODO Make *bytes actually behave as specified above
  */
 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
-    int n_start, int n_end)
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index;
@@ -901,8 +897,11 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
      * Calculate the number of clusters to look for. We stop at L2 table
      * boundaries to keep things simple.
      */
+    nb_clusters =
+        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
     l2_index = offset_to_l2_index(s, guest_offset);
-    nb_clusters = MIN(size_to_clusters(s, *bytes), s->l2_size - l2_index);
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
 
     /* Find L2 entry for the first involved cluster */
     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
@@ -954,10 +953,13 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
          * newly allocated cluster to the end of the aread that the write
          * request actually writes to (excluding COW at the end)
          */
-        int requested_sectors = n_end;
+        int requested_sectors =
+            (*bytes + offset_into_cluster(s, guest_offset))
+            >> BDRV_SECTOR_BITS;
         int avail_sectors = nb_clusters
                             << (s->cluster_bits - BDRV_SECTOR_BITS);
-        int alloc_n_start = *host_offset == 0 ? n_start : 0;
+        int alloc_n_start = offset_into_cluster(s, guest_offset)
+                            >> BDRV_SECTOR_BITS;
         int nb_sectors = MIN(requested_sectors, avail_sectors);
 
         if (*host_offset == 0) {
@@ -984,7 +986,9 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         qemu_co_queue_init(&(*m)->dependent_requests);
         QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
 
-        *bytes = nb_clusters * s->cluster_size;
+        *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+                             - offset_into_cluster(s, guest_offset));
+        assert(*bytes != 0);
     } else {
         *bytes = 0;
         return 0;
@@ -1139,10 +1143,9 @@ again:
         alloc_n_end = n_end;
     }
 
-    cur_bytes = nb_clusters * s->cluster_size;
+    cur_bytes = MIN(cur_bytes, ((alloc_n_end - alloc_n_start) << BDRV_SECTOR_BITS));
 
-    ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m,
-                       alloc_n_start, alloc_n_end);
+    ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m);
     if (ret < 0) {
         return ret;
     }
@@ -1150,7 +1153,7 @@ again:
     if (!*host_offset) {
         *host_offset = cluster_offset;
     }
-    nb_clusters = size_to_clusters(s, cur_bytes);
+    nb_clusters = size_to_clusters(s, cur_bytes + offset_into_cluster(s, offset));
 
     /* Some cleanup work */
 done:
diff --git a/block/qcow2.h b/block/qcow2.h
index c4eaf670ec..32806bd035 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -271,6 +271,11 @@ static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
     return offset & ~(s->cluster_size - 1);
 }
 
+static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & (s->cluster_size - 1);
+}
+
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
 {
     return (size + (s->cluster_size - 1)) >> s->cluster_bits;

From 83baa9a4719b42bc28d525fa28af643523cc2bf3 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:04 +0100
Subject: [PATCH 13/23] qcow2: Clean up handle_alloc()

Things can be simplified a bit now. No semantic changes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 110 ++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 57 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8ed1f7dd2a..11414830ec 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -886,7 +886,6 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     unsigned int nb_clusters;
     int ret;
 
-    uint64_t alloc_offset;
     uint64_t alloc_cluster_offset;
 
     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
@@ -928,72 +927,69 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         return 0;
     }
 
-    /* Calculate start and size of allocation */
-    alloc_offset = guest_offset;
-    alloc_cluster_offset = *host_offset;
-
     /* Allocate, if necessary at a given offset in the image file */
-    ret = do_alloc_cluster_offset(bs, alloc_offset, &alloc_cluster_offset,
+    alloc_cluster_offset = *host_offset;
+    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                   &nb_clusters);
     if (ret < 0) {
         goto fail;
     }
 
-    /* save info needed for meta data update */
-    if (nb_clusters > 0) {
-        /*
-         * requested_sectors: Number of sectors from the start of the first
-         * newly allocated cluster to the end of the (possibly shortened
-         * before) write request.
-         *
-         * avail_sectors: Number of sectors from the start of the first
-         * newly allocated to the end of the last newly allocated cluster.
-         *
-         * nb_sectors: The number of sectors from the start of the first
-         * newly allocated cluster to the end of the aread that the write
-         * request actually writes to (excluding COW at the end)
-         */
-        int requested_sectors =
-            (*bytes + offset_into_cluster(s, guest_offset))
-            >> BDRV_SECTOR_BITS;
-        int avail_sectors = nb_clusters
-                            << (s->cluster_bits - BDRV_SECTOR_BITS);
-        int alloc_n_start = offset_into_cluster(s, guest_offset)
-                            >> BDRV_SECTOR_BITS;
-        int nb_sectors = MIN(requested_sectors, avail_sectors);
-
-        if (*host_offset == 0) {
-            *host_offset = alloc_cluster_offset;
-        }
-
-        *m = g_malloc0(sizeof(**m));
-
-        **m = (QCowL2Meta) {
-            .alloc_offset   = alloc_cluster_offset,
-            .offset         = alloc_offset & ~(s->cluster_size - 1),
-            .nb_clusters    = nb_clusters,
-            .nb_available   = nb_sectors,
-
-            .cow_start = {
-                .offset     = 0,
-                .nb_sectors = alloc_n_start,
-            },
-            .cow_end = {
-                .offset     = nb_sectors * BDRV_SECTOR_SIZE,
-                .nb_sectors = avail_sectors - nb_sectors,
-            },
-        };
-        qemu_co_queue_init(&(*m)->dependent_requests);
-        QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
-
-        *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
-                             - offset_into_cluster(s, guest_offset));
-        assert(*bytes != 0);
-    } else {
+    /* Can't extend contiguous allocation */
+    if (nb_clusters == 0) {
         *bytes = 0;
         return 0;
     }
 
+    /*
+     * Save info needed for meta data update.
+     *
+     * requested_sectors: Number of sectors from the start of the first
+     * newly allocated cluster to the end of the (possibly shortened
+     * before) write request.
+     *
+     * avail_sectors: Number of sectors from the start of the first
+     * newly allocated to the end of the last newly allocated cluster.
+     *
+     * nb_sectors: The number of sectors from the start of the first
+     * newly allocated cluster to the end of the area that the write
+     * request actually writes to (excluding COW at the end)
+     */
+    int requested_sectors =
+        (*bytes + offset_into_cluster(s, guest_offset))
+        >> BDRV_SECTOR_BITS;
+    int avail_sectors = nb_clusters
+                        << (s->cluster_bits - BDRV_SECTOR_BITS);
+    int alloc_n_start = offset_into_cluster(s, guest_offset)
+                        >> BDRV_SECTOR_BITS;
+    int nb_sectors = MIN(requested_sectors, avail_sectors);
+
+    *host_offset = alloc_cluster_offset;
+
+    *m = g_malloc0(sizeof(**m));
+
+    **m = (QCowL2Meta) {
+        .alloc_offset   = *host_offset,
+        .offset         = start_of_cluster(s, guest_offset),
+        .nb_clusters    = nb_clusters,
+        .nb_available   = nb_sectors,
+
+        .cow_start = {
+            .offset     = 0,
+            .nb_sectors = alloc_n_start,
+        },
+        .cow_end = {
+            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+            .nb_sectors = avail_sectors - nb_sectors,
+        },
+    };
+    qemu_co_queue_init(&(*m)->dependent_requests);
+    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+                         - offset_into_cluster(s, guest_offset));
+    assert(*bytes != 0);
+
     return 1;
 
 fail:

From 0af729ec007ea4d103a2e3f3fc5db522610a2290 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:05 +0100
Subject: [PATCH 14/23] qcow2: Factor out handle_copied()

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 134 +++++++++++++++++++++++++++++-------------
 trace-events          |   1 +
 2 files changed, 95 insertions(+), 40 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 11414830ec..9036bd8fae 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -811,6 +811,84 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
     return 0;
 }
 
+/*
+ * Checks how many already allocated clusters that don't require a copy on
+ * write there are at the given guest_offset (up to *bytes). If
+ * *host_offset is not zero, only physically contiguous clusters beginning at
+ * this host offset are counted.
+ *
+ * Note that guest_offset may not be cluster aligned.
+ *
+ * Returns:
+ *   0:     if no allocated clusters are available at the given offset.
+ *          *bytes is normally unchanged. It is set to 0 if the cluster
+ *          is allocated and doesn't need COW, but doesn't have the right
+ *          physical offset.
+ *
+ *   1:     if allocated clusters that don't require a COW are available at
+ *          the requested offset. *bytes may have decreased and describes
+ *          the length of the area that can be written to.
+ *
+ *  -errno: in error cases
+ *
+ * TODO Get rid of keep_clusters, nb_clusters parameters
+ * TODO Make bytes behave like described above
+ * TODO Make non-zero host_offset behave like describe above
+ */
+static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
+    unsigned int *keep_clusters, unsigned int *nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index;
+    uint64_t cluster_offset;
+    uint64_t *l2_table;
+    int ret, pret;
+
+    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
+                              *bytes);
+    assert(*host_offset == 0);
+
+    /* Find L2 entry for the first involved cluster */
+    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+    if (ret < 0) {
+        return ret;
+    }
+
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+    /* Check how many clusters are already allocated and don't need COW */
+    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+        && (cluster_offset & QCOW_OFLAG_COPIED))
+    {
+        /* We keep all QCOW_OFLAG_COPIED clusters */
+        *keep_clusters =
+            count_contiguous_clusters(*nb_clusters, s->cluster_size,
+                                      &l2_table[l2_index], 0,
+                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+        assert(*keep_clusters <= *nb_clusters);
+        *nb_clusters -= *keep_clusters;
+
+        ret = 1;
+    } else {
+        *keep_clusters = 0;
+        cluster_offset = 0;
+
+        ret = 0;
+    }
+
+    cluster_offset &= L2E_OFFSET_MASK;
+    *host_offset = cluster_offset;
+
+    /* Cleanup */
+    pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    if (pret < 0) {
+        return pret;
+    }
+
+    return ret;
+}
+
 /*
  * Allocates new clusters for the given guest_offset.
  *
@@ -1023,7 +1101,6 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret, sectors;
-    uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
     uint64_t cluster_offset;
     uint64_t cur_bytes;
@@ -1032,6 +1109,9 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                       n_start, n_end);
 
 again:
+    cluster_offset = 0;
+    *host_offset = 0;
+
     /*
      * Calculate the number of clusters to look for. We stop at L2 table
      * boundaries to keep things simple.
@@ -1057,12 +1137,6 @@ again:
      *         allocation ends. Shorten the COW of the in-fight allocation, set
      *         cluster_offset to write to the same cluster and set up the right
      *         synchronisation between the in-flight request and the new one.
-     *
-     * 2. Count contiguous COPIED clusters.
-     *    TODO: Consider cluster_offset if set in step 1c.
-     *
-     * 3. If the request still hasn't completed, allocate new clusters,
-     *    considering any cluster_offset of steps 1c or 2.
      */
     cur_bytes = (n_end - n_start) * BDRV_SECTOR_SIZE;
     ret = handle_dependencies(bs, offset, &cur_bytes);
@@ -1079,43 +1153,19 @@ again:
     nb_clusters = size_to_clusters(s, offset + cur_bytes)
                 - (offset >> s->cluster_bits);
 
-    /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
-    if (ret < 0) {
-        return ret;
-    }
-
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
-
-    /* Check how many clusters are already allocated and don't need COW */
-    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
-        && (cluster_offset & QCOW_OFLAG_COPIED))
-    {
-        /* We keep all QCOW_OFLAG_COPIED clusters */
-        keep_clusters =
-            count_contiguous_clusters(nb_clusters, s->cluster_size,
-                                      &l2_table[l2_index], 0,
-                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
-        assert(keep_clusters <= nb_clusters);
-        nb_clusters -= keep_clusters;
-    } else {
-        keep_clusters = 0;
-        cluster_offset = 0;
-    }
-
-    cluster_offset &= L2E_OFFSET_MASK;
-    *host_offset = cluster_offset;
-
     /*
-     * The L2 table isn't used any more after this. As long as the cache works
-     * synchronously, it's important to release it before calling
-     * do_alloc_cluster_offset, which may yield if we need to wait for another
-     * request to complete. If we still had the reference, we could use up the
-     * whole cache with sleeping requests.
+     * 2. Count contiguous COPIED clusters.
+     *    TODO: Consider cluster_offset if set in step 1c.
      */
-    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    uint64_t tmp_bytes = cur_bytes;
+    ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m,
+                        &keep_clusters, &nb_clusters);
     if (ret < 0) {
         return ret;
+    } else if (ret) {
+        if (!*host_offset) {
+            *host_offset = cluster_offset;
+        }
     }
 
     /* If there is something left to allocate, do that now */
@@ -1123,6 +1173,10 @@ again:
         goto done;
     }
 
+    /*
+     * 3. If the request still hasn't completed, allocate new clusters,
+     *    considering any cluster_offset of steps 1c or 2.
+     */
     int alloc_n_start;
     int alloc_n_end;
 
diff --git a/trace-events b/trace-events
index 9511a28ca7..5a6ef4b406 100644
--- a/trace-events
+++ b/trace-events
@@ -483,6 +483,7 @@ qcow2_writev_done_part(void *co, int cur_nr_sectors) "co %p cur_nr_sectors %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
 
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int n_start, int n_end) "co %p offet %" PRIx64 " n_start %d n_end %d"
+qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offet %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
 qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offet %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
 qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offet %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
 qcow2_cluster_alloc_phys(void *co) "co %p"

From acb0467f8df7e9dbc8bbcb9a2e1e8cfe17f79691 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:06 +0100
Subject: [PATCH 15/23] qcow2: handle_copied(): Get rid of nb_clusters
 parameter

handle_copied() uses its bytes parameter now to determine how many
clusters it should try to find.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 9036bd8fae..d640328b5c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -831,24 +831,35 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
  *
  *  -errno: in error cases
  *
- * TODO Get rid of keep_clusters, nb_clusters parameters
+ * TODO Get rid of keep_clusters parameter
  * TODO Make bytes behave like described above
  * TODO Make non-zero host_offset behave like describe above
  */
 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
-    unsigned int *keep_clusters, unsigned int *nb_clusters)
+    unsigned int *keep_clusters)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index;
     uint64_t cluster_offset;
     uint64_t *l2_table;
+    unsigned int nb_clusters;
     int ret, pret;
 
     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                               *bytes);
     assert(*host_offset == 0);
 
+    /*
+     * Calculate the number of clusters to look for. We stop at L2 table
+     * boundaries to keep things simple.
+     */
+    nb_clusters =
+        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+    l2_index = offset_to_l2_index(s, guest_offset);
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
     /* Find L2 entry for the first involved cluster */
     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
     if (ret < 0) {
@@ -863,11 +874,10 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     {
         /* We keep all QCOW_OFLAG_COPIED clusters */
         *keep_clusters =
-            count_contiguous_clusters(*nb_clusters, s->cluster_size,
+            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                       &l2_table[l2_index], 0,
                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
-        assert(*keep_clusters <= *nb_clusters);
-        *nb_clusters -= *keep_clusters;
+        assert(*keep_clusters <= nb_clusters);
 
         ret = 1;
     } else {
@@ -1159,10 +1169,12 @@ again:
      */
     uint64_t tmp_bytes = cur_bytes;
     ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m,
-                        &keep_clusters, &nb_clusters);
+                        &keep_clusters);
     if (ret < 0) {
         return ret;
     } else if (ret) {
+        nb_clusters -= keep_clusters;
+
         if (!*host_offset) {
             *host_offset = cluster_offset;
         }

From c53ede9f6d8f0de7939eea676c1398c4073ff35e Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:07 +0100
Subject: [PATCH 16/23] qcow2: handle_copied(): Get rid of keep_clusters
 parameter

Now *bytes is used to return the length of the area that can be written
to without performing an allocation or COW.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d640328b5c..5e5465dcac 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -831,19 +831,17 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
  *
  *  -errno: in error cases
  *
- * TODO Get rid of keep_clusters parameter
- * TODO Make bytes behave like described above
  * TODO Make non-zero host_offset behave like describe above
  */
 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m,
-    unsigned int *keep_clusters)
+    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index;
     uint64_t cluster_offset;
     uint64_t *l2_table;
     unsigned int nb_clusters;
+    unsigned int keep_clusters;
     int ret, pret;
 
     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
@@ -873,17 +871,19 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
         && (cluster_offset & QCOW_OFLAG_COPIED))
     {
         /* We keep all QCOW_OFLAG_COPIED clusters */
-        *keep_clusters =
+        keep_clusters =
             count_contiguous_clusters(nb_clusters, s->cluster_size,
                                       &l2_table[l2_index], 0,
                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
-        assert(*keep_clusters <= nb_clusters);
+        assert(keep_clusters <= nb_clusters);
+
+        *bytes = MIN(*bytes,
+                 keep_clusters * s->cluster_size
+                 - offset_into_cluster(s, guest_offset));
 
         ret = 1;
     } else {
-        *keep_clusters = 0;
         cluster_offset = 0;
-
         ret = 0;
     }
 
@@ -1168,16 +1168,19 @@ again:
      *    TODO: Consider cluster_offset if set in step 1c.
      */
     uint64_t tmp_bytes = cur_bytes;
-    ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m,
-                        &keep_clusters);
+    ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m);
     if (ret < 0) {
         return ret;
     } else if (ret) {
+        keep_clusters =
+            size_to_clusters(s, tmp_bytes + offset_into_cluster(s, offset));
         nb_clusters -= keep_clusters;
 
         if (!*host_offset) {
             *host_offset = cluster_offset;
         }
+    } else {
+        keep_clusters = 0;
     }
 
     /* If there is something left to allocate, do that now */

From e62daaf67958e8274547ddac87cb0a177a869216 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:08 +0100
Subject: [PATCH 17/23] qcow2: handle_copied(): Implement non-zero host_offset

Look only for clusters that start at a given physical offset.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 5e5465dcac..239a997027 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -830,8 +830,6 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
  *          the length of the area that can be written to.
  *
  *  -errno: in error cases
- *
- * TODO Make non-zero host_offset behave like describe above
  */
 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
@@ -846,7 +844,6 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 
     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                               *bytes);
-    assert(*host_offset == 0);
 
     /*
      * Calculate the number of clusters to look for. We stop at L2 table
@@ -870,6 +867,16 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
         && (cluster_offset & QCOW_OFLAG_COPIED))
     {
+        /* If a specific host_offset is required, check it */
+        bool offset_matches =
+            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
+
+        if (*host_offset != 0 && !offset_matches) {
+            *bytes = 0;
+            ret = 0;
+            goto out;
+        }
+
         /* We keep all QCOW_OFLAG_COPIED clusters */
         keep_clusters =
             count_contiguous_clusters(nb_clusters, s->cluster_size,
@@ -883,19 +890,22 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 
         ret = 1;
     } else {
-        cluster_offset = 0;
         ret = 0;
     }
 
-    cluster_offset &= L2E_OFFSET_MASK;
-    *host_offset = cluster_offset;
-
     /* Cleanup */
+out:
     pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (pret < 0) {
         return pret;
     }
 
+    /* Only return a host offset if we actually made progress. Otherwise we
+     * would make requirements for handle_alloc() that it can't fulfill */
+    if (ret) {
+        *host_offset = cluster_offset & L2E_OFFSET_MASK;
+    }
+
     return ret;
 }
 
@@ -1165,7 +1175,6 @@ again:
 
     /*
      * 2. Count contiguous COPIED clusters.
-     *    TODO: Consider cluster_offset if set in step 1c.
      */
     uint64_t tmp_bytes = cur_bytes;
     ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m);
@@ -1179,6 +1188,9 @@ again:
         if (!*host_offset) {
             *host_offset = cluster_offset;
         }
+    } else if (cur_bytes == 0) {
+        keep_clusters = 0;
+        goto done;
     } else {
         keep_clusters = 0;
     }

From 411d62b04b4cd2d3a6cea310689dbafa2479bc28 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:09 +0100
Subject: [PATCH 18/23] qcow2: Prepare handle_alloc/copied() for byte
 granularity

This makes handle_alloc() and handle_copied() return byte-granularity
host offsets instead of returning always the cluster start. This is
required so that qcow2_alloc_cluster_offset() can stop aligning
everything to cluster boundaries.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 239a997027..4f43d41132 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -817,7 +817,9 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
  * *host_offset is not zero, only physically contiguous clusters beginning at
  * this host offset are counted.
  *
- * Note that guest_offset may not be cluster aligned.
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
  *
  * Returns:
  *   0:     if no allocated clusters are available at the given offset.
@@ -845,6 +847,9 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                               *bytes);
 
+    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
+                                == offset_into_cluster(s, *host_offset));
+
     /*
      * Calculate the number of clusters to look for. We stop at L2 table
      * boundaries to keep things simple.
@@ -903,7 +908,8 @@ out:
     /* Only return a host offset if we actually made progress. Otherwise we
      * would make requirements for handle_alloc() that it can't fulfill */
     if (ret) {
-        *host_offset = cluster_offset & L2E_OFFSET_MASK;
+        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
+                     + offset_into_cluster(s, guest_offset);
     }
 
     return ret;
@@ -961,7 +967,9 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
  * copy on write. If *host_offset is non-zero, clusters are only allocated if
  * the new allocation can match the specified host offset.
  *
- * Note that guest_offset may not be cluster aligned.
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to exact byte referenced by guest_offset and
+ * therefore isn't cluster aligned as well.
  *
  * Returns:
  *   0:     if no clusters could be allocated. *bytes is set to 0,
@@ -1026,7 +1034,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     }
 
     /* Allocate, if necessary at a given offset in the image file */
-    alloc_cluster_offset = *host_offset;
+    alloc_cluster_offset = start_of_cluster(s, *host_offset);
     ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                   &nb_clusters);
     if (ret < 0) {
@@ -1062,12 +1070,10 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
                         >> BDRV_SECTOR_BITS;
     int nb_sectors = MIN(requested_sectors, avail_sectors);
 
-    *host_offset = alloc_cluster_offset;
-
     *m = g_malloc0(sizeof(**m));
 
     **m = (QCowL2Meta) {
-        .alloc_offset   = *host_offset,
+        .alloc_offset   = alloc_cluster_offset,
         .offset         = start_of_cluster(s, guest_offset),
         .nb_clusters    = nb_clusters,
         .nb_available   = nb_sectors,
@@ -1084,6 +1090,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     qemu_co_queue_init(&(*m)->dependent_requests);
     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
 
+    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
     *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
                          - offset_into_cluster(s, guest_offset));
     assert(*bytes != 0);
@@ -1186,7 +1193,7 @@ again:
         nb_clusters -= keep_clusters;
 
         if (!*host_offset) {
-            *host_offset = cluster_offset;
+            *host_offset = start_of_cluster(s, cluster_offset);
         }
     } else if (cur_bytes == 0) {
         keep_clusters = 0;
@@ -1228,7 +1235,7 @@ again:
     }
 
     if (!*host_offset) {
-        *host_offset = cluster_offset;
+        *host_offset = start_of_cluster(s, cluster_offset);
     }
     nb_clusters = size_to_clusters(s, cur_bytes + offset_into_cluster(s, offset));
 

From 710c2496d8cecc92568d439a3cf9d5874b3a55e2 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:10 +0100
Subject: [PATCH 19/23] qcow2: Use byte granularity in
 qcow2_alloc_cluster_offset()

This gets rid of the nb_clusters and keep_clusters and the associated
complicated calculations. Just advance the number of bytes that have
been processed and everything is fine.

This patch advances the variables even after the last operation even
though they aren't used any more afterwards to make things look more
uniform. A later patch will turn the whole thing into a loop and then
it actually starts making sense.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 84 +++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 56 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4f43d41132..78e7db9b28 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1127,27 +1127,23 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
-    int l2_index, ret, sectors;
-    unsigned int nb_clusters, keep_clusters;
+    uint64_t start, remaining;
     uint64_t cluster_offset;
     uint64_t cur_bytes;
+    int ret;
 
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
 
+    assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
+    offset = start_of_cluster(s, offset);
+
 again:
+    start = offset + (n_start << BDRV_SECTOR_BITS);
+    remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
     cluster_offset = 0;
     *host_offset = 0;
 
-    /*
-     * Calculate the number of clusters to look for. We stop at L2 table
-     * boundaries to keep things simple.
-     */
-    l2_index = offset_to_l2_index(s, offset);
-    nb_clusters = MIN(size_to_clusters(s, n_end << BDRV_SECTOR_BITS),
-                      s->l2_size - l2_index);
-    n_end = MIN(n_end, nb_clusters * s->cluster_sectors);
-
     /*
      * Now start gathering as many contiguous clusters as possible:
      *
@@ -1165,8 +1161,8 @@ again:
      *         cluster_offset to write to the same cluster and set up the right
      *         synchronisation between the in-flight request and the new one.
      */
-    cur_bytes = (n_end - n_start) * BDRV_SECTOR_SIZE;
-    ret = handle_dependencies(bs, offset, &cur_bytes);
+    cur_bytes = remaining;
+    ret = handle_dependencies(bs, start, &cur_bytes);
     if (ret == -EAGAIN) {
         goto again;
     } else if (ret < 0) {
@@ -1177,33 +1173,28 @@ again:
          * correctly during the next loop iteration. */
     }
 
-    nb_clusters = size_to_clusters(s, offset + cur_bytes)
-                - (offset >> s->cluster_bits);
-
     /*
      * 2. Count contiguous COPIED clusters.
      */
-    uint64_t tmp_bytes = cur_bytes;
-    ret = handle_copied(bs, offset, &cluster_offset, &tmp_bytes, m);
+    ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
     if (ret < 0) {
         return ret;
     } else if (ret) {
-        keep_clusters =
-            size_to_clusters(s, tmp_bytes + offset_into_cluster(s, offset));
-        nb_clusters -= keep_clusters;
-
         if (!*host_offset) {
             *host_offset = start_of_cluster(s, cluster_offset);
         }
+
+        start           += cur_bytes;
+        remaining       -= cur_bytes;
+        cluster_offset  += cur_bytes;
+
+        cur_bytes = remaining;
     } else if (cur_bytes == 0) {
-        keep_clusters = 0;
         goto done;
-    } else {
-        keep_clusters = 0;
     }
 
     /* If there is something left to allocate, do that now */
-    if (nb_clusters == 0) {
+    if (remaining == 0) {
         goto done;
     }
 
@@ -1211,43 +1202,24 @@ again:
      * 3. If the request still hasn't completed, allocate new clusters,
      *    considering any cluster_offset of steps 1c or 2.
      */
-    int alloc_n_start;
-    int alloc_n_end;
-
-    if (keep_clusters != 0) {
-        offset         = start_of_cluster(s, offset
-                                             + keep_clusters * s->cluster_size);
-        cluster_offset = start_of_cluster(s, cluster_offset
-                                             + keep_clusters * s->cluster_size);
-
-        alloc_n_start = 0;
-        alloc_n_end = n_end - keep_clusters * s->cluster_sectors;
-    } else {
-        alloc_n_start = n_start;
-        alloc_n_end = n_end;
-    }
-
-    cur_bytes = MIN(cur_bytes, ((alloc_n_end - alloc_n_start) << BDRV_SECTOR_BITS));
-
-    ret = handle_alloc(bs, offset, &cluster_offset, &cur_bytes, m);
+    ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
     if (ret < 0) {
         return ret;
-    }
+    } else if (ret) {
+        if (!*host_offset) {
+            *host_offset = start_of_cluster(s, cluster_offset);
+        }
 
-    if (!*host_offset) {
-        *host_offset = start_of_cluster(s, cluster_offset);
+        start           += cur_bytes;
+        remaining       -= cur_bytes;
+        cluster_offset  += cur_bytes;
     }
-    nb_clusters = size_to_clusters(s, cur_bytes + offset_into_cluster(s, offset));
 
     /* Some cleanup work */
 done:
-    sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
-    if (sectors > n_end) {
-        sectors = n_end;
-    }
-
-    assert(sectors > n_start);
-    *num = sectors - n_start;
+    *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
+    assert(*num > 0);
+    assert(*host_offset != 0);
 
     return 0;
 }

From 88c6588c5165da1526f735ed850861c5b74670bd Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:11 +0100
Subject: [PATCH 20/23] qcow2: Allow requests with multiple l2metas

Instead of expecting a single l2meta, have a list of them. This allows
to still have a single I/O request for the guest data, even though
multiple l2meta may be needed in order to describe both a COW overwrite
and a new cluster allocation (typical sequential write case).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c |  3 +++
 block/qcow2.c         | 14 +++++++++++---
 block/qcow2.h         |  3 +++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 78e7db9b28..6dc7f7f36b 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1069,10 +1069,13 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     int alloc_n_start = offset_into_cluster(s, guest_offset)
                         >> BDRV_SECTOR_BITS;
     int nb_sectors = MIN(requested_sectors, avail_sectors);
+    QCowL2Meta *old_m = *m;
 
     *m = g_malloc0(sizeof(**m));
 
     **m = (QCowL2Meta) {
+        .next           = old_m,
+
         .alloc_offset   = alloc_cluster_offset,
         .offset         = start_of_cluster(s, guest_offset),
         .nb_clusters    = nb_clusters,
diff --git a/block/qcow2.c b/block/qcow2.c
index 3f7edf5652..7e7d775b37 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -858,7 +858,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             goto fail;
         }
 
-        if (l2meta != NULL) {
+        while (l2meta != NULL) {
+            QCowL2Meta *next;
+
             ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
             if (ret < 0) {
                 goto fail;
@@ -871,8 +873,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
             qemu_co_queue_restart_all(&l2meta->dependent_requests);
 
+            next = l2meta->next;
             g_free(l2meta);
-            l2meta = NULL;
+            l2meta = next;
         }
 
         remaining_sectors -= cur_nr_sectors;
@@ -885,12 +888,17 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 fail:
     qemu_co_mutex_unlock(&s->lock);
 
-    if (l2meta != NULL) {
+    while (l2meta != NULL) {
+        QCowL2Meta *next;
+
         if (l2meta->nb_clusters != 0) {
             QLIST_REMOVE(l2meta, next_in_flight);
         }
         qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+        next = l2meta->next;
         g_free(l2meta);
+        l2meta = next;
     }
 
     qemu_iovec_destroy(&hd_qiov);
diff --git a/block/qcow2.h b/block/qcow2.h
index 32806bd035..bf8db2abd3 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -250,6 +250,9 @@ typedef struct QCowL2Meta
      */
     Qcow2COWRegion cow_end;
 
+    /** Pointer to next L2Meta of the same write request */
+    struct QCowL2Meta *next;
+
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 

From 2c3b32d25620c26e26fd590c198ec6d9cf91da57 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 26 Mar 2013 17:50:12 +0100
Subject: [PATCH 21/23] qcow2: Move cluster gathering to a non-looping loop

This patch is mainly to separate the indentation change from the
semantic changes. All that really changes here is that everything moves
into a while loop, all 'goto done' become 'break' and at the end of the
loop a new 'break is inserted.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c | 142 ++++++++++++++++++++++--------------------
 1 file changed, 74 insertions(+), 68 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 6dc7f7f36b..960d446417 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1147,79 +1147,85 @@ again:
     cluster_offset = 0;
     *host_offset = 0;
 
-    /*
-     * Now start gathering as many contiguous clusters as possible:
-     *
-     * 1. Check for overlaps with in-flight allocations
-     *
-     *      a) Overlap not in the first cluster -> shorten this request and let
-     *         the caller handle the rest in its next loop iteration.
-     *
-     *      b) Real overlaps of two requests. Yield and restart the search for
-     *         contiguous clusters (the situation could have changed while we
-     *         were sleeping)
-     *
-     *      c) TODO: Request starts in the same cluster as the in-flight
-     *         allocation ends. Shorten the COW of the in-fight allocation, set
-     *         cluster_offset to write to the same cluster and set up the right
-     *         synchronisation between the in-flight request and the new one.
-     */
-    cur_bytes = remaining;
-    ret = handle_dependencies(bs, start, &cur_bytes);
-    if (ret == -EAGAIN) {
-        goto again;
-    } else if (ret < 0) {
-        return ret;
-    } else {
-        /* handle_dependencies() may have decreased cur_bytes (shortened
-         * the allocations below) so that the next dependency is processed
-         * correctly during the next loop iteration. */
-    }
-
-    /*
-     * 2. Count contiguous COPIED clusters.
-     */
-    ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
-    if (ret < 0) {
-        return ret;
-    } else if (ret) {
-        if (!*host_offset) {
-            *host_offset = start_of_cluster(s, cluster_offset);
-        }
-
-        start           += cur_bytes;
-        remaining       -= cur_bytes;
-        cluster_offset  += cur_bytes;
-
+    while (true) {
+        /*
+         * Now start gathering as many contiguous clusters as possible:
+         *
+         * 1. Check for overlaps with in-flight allocations
+         *
+         *      a) Overlap not in the first cluster -> shorten this request and
+         *         let the caller handle the rest in its next loop iteration.
+         *
+         *      b) Real overlaps of two requests. Yield and restart the search
+         *         for contiguous clusters (the situation could have changed
+         *         while we were sleeping)
+         *
+         *      c) TODO: Request starts in the same cluster as the in-flight
+         *         allocation ends. Shorten the COW of the in-fight allocation,
+         *         set cluster_offset to write to the same cluster and set up
+         *         the right synchronisation between the in-flight request and
+         *         the new one.
+         */
         cur_bytes = remaining;
-    } else if (cur_bytes == 0) {
-        goto done;
-    }
-
-    /* If there is something left to allocate, do that now */
-    if (remaining == 0) {
-        goto done;
-    }
-
-    /*
-     * 3. If the request still hasn't completed, allocate new clusters,
-     *    considering any cluster_offset of steps 1c or 2.
-     */
-    ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
-    if (ret < 0) {
-        return ret;
-    } else if (ret) {
-        if (!*host_offset) {
-            *host_offset = start_of_cluster(s, cluster_offset);
+        ret = handle_dependencies(bs, start, &cur_bytes);
+        if (ret == -EAGAIN) {
+            goto again;
+        } else if (ret < 0) {
+            return ret;
+        } else {
+            /* handle_dependencies() may have decreased cur_bytes (shortened
+             * the allocations below) so that the next dependency is processed
+             * correctly during the next loop iteration. */
         }
 
-        start           += cur_bytes;
-        remaining       -= cur_bytes;
-        cluster_offset  += cur_bytes;
+        /*
+         * 2. Count contiguous COPIED clusters.
+         */
+        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
+        if (ret < 0) {
+            return ret;
+        } else if (ret) {
+            if (!*host_offset) {
+                *host_offset = start_of_cluster(s, cluster_offset);
+            }
+
+            start           += cur_bytes;
+            remaining       -= cur_bytes;
+            cluster_offset  += cur_bytes;
+
+            cur_bytes = remaining;
+        } else if (cur_bytes == 0) {
+            break;
+        }
+
+        /* If there is something left to allocate, do that now */
+        if (remaining == 0) {
+            break;
+        }
+
+        /*
+         * 3. If the request still hasn't completed, allocate new clusters,
+         *    considering any cluster_offset of steps 1c or 2.
+         */
+        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
+        if (ret < 0) {
+            return ret;
+        } else if (ret) {
+            if (!*host_offset) {
+                *host_offset = start_of_cluster(s, cluster_offset);
+            }
+
+            start           += cur_bytes;
+            remaining       -= cur_bytes;
+            cluster_offset  += cur_bytes;
+
+            break;
+        } else {
+            assert(cur_bytes == 0);
+            break;
+        }
     }
 
-    /* Some cleanup work */
-done:
     *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
     assert(*num > 0);
     assert(*host_offset != 0);

From ecdd5333ab9ed3f2b848066aaaef02c027b25e36 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 27 Mar 2013 11:43:49 +0100
Subject: [PATCH 22/23] qcow2: Gather clusters in a looping loop

Instead of just checking once in exactly this order if there are
dependendies, non-COW clusters and new allocation, this starts looping
around these. This way we can, for example, gather non-COW clusters after
new allocations as long as the host cluster offsets stay contiguous.

Once handle_dependencies() is extended so that COW areas of in-flight
allocations can be overwritten, this allows to continue with gathering
other clusters (we wouldn't be able to do that without this change
because we would have missed a possible second dependency in one of the
next clusters).

This means that in the typical sequential write case, we can combine the
COW overwrite of one cluster with the allocation of the next cluster as
soon as something like Delayed COW gets actually implemented. It is only
by avoiding splitting requests this way that Delayed COW actually starts
improving performance noticably.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2-cluster.c      | 74 ++++++++++++++++++++++----------------
 tests/qemu-iotests/044.out |  2 +-
 2 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 960d446417..c71470a3db 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -770,7 +770,7 @@ out:
  *           must start over anyway, so consider *cur_bytes undefined.
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *cur_bytes)
+    uint64_t *cur_bytes, QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
@@ -793,6 +793,15 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
                 bytes = 0;
             }
 
+            /* Stop if already an l2meta exists. After yielding, it wouldn't
+             * be valid any more, so we'd have to clean up the old L2Metas
+             * and deal with requests depending on them before starting to
+             * gather new ones. Not worth the trouble. */
+            if (bytes == 0 && *m) {
+                *cur_bytes = 0;
+                return 0;
+            }
+
             if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
@@ -1023,16 +1032,16 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
         nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
     }
 
+    /* This function is only called when there were no non-COW clusters, so if
+     * we can't find any unallocated or COW clusters either, something is
+     * wrong with our code. */
+    assert(nb_clusters > 0);
+
     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
     if (ret < 0) {
         return ret;
     }
 
-    if (nb_clusters == 0) {
-        *bytes = 0;
-        return 0;
-    }
-
     /* Allocate, if necessary at a given offset in the image file */
     alloc_cluster_offset = start_of_cluster(s, *host_offset);
     ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
@@ -1146,8 +1155,27 @@ again:
     remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
     cluster_offset = 0;
     *host_offset = 0;
+    cur_bytes = 0;
+    *m = NULL;
 
     while (true) {
+
+        if (!*host_offset) {
+            *host_offset = start_of_cluster(s, cluster_offset);
+        }
+
+        assert(remaining >= cur_bytes);
+
+        start           += cur_bytes;
+        remaining       -= cur_bytes;
+        cluster_offset  += cur_bytes;
+
+        if (remaining == 0) {
+            break;
+        }
+
+        cur_bytes = remaining;
+
         /*
          * Now start gathering as many contiguous clusters as possible:
          *
@@ -1166,12 +1194,17 @@ again:
          *         the right synchronisation between the in-flight request and
          *         the new one.
          */
-        cur_bytes = remaining;
-        ret = handle_dependencies(bs, start, &cur_bytes);
+        ret = handle_dependencies(bs, start, &cur_bytes, m);
         if (ret == -EAGAIN) {
+            /* Currently handle_dependencies() doesn't yield if we already had
+             * an allocation. If it did, we would have to clean up the L2Meta
+             * structs before starting over. */
+            assert(*m == NULL);
             goto again;
         } else if (ret < 0) {
             return ret;
+        } else if (cur_bytes == 0) {
+            break;
         } else {
             /* handle_dependencies() may have decreased cur_bytes (shortened
              * the allocations below) so that the next dependency is processed
@@ -1185,24 +1218,11 @@ again:
         if (ret < 0) {
             return ret;
         } else if (ret) {
-            if (!*host_offset) {
-                *host_offset = start_of_cluster(s, cluster_offset);
-            }
-
-            start           += cur_bytes;
-            remaining       -= cur_bytes;
-            cluster_offset  += cur_bytes;
-
-            cur_bytes = remaining;
+            continue;
         } else if (cur_bytes == 0) {
             break;
         }
 
-        /* If there is something left to allocate, do that now */
-        if (remaining == 0) {
-            break;
-        }
-
         /*
          * 3. If the request still hasn't completed, allocate new clusters,
          *    considering any cluster_offset of steps 1c or 2.
@@ -1211,15 +1231,7 @@ again:
         if (ret < 0) {
             return ret;
         } else if (ret) {
-            if (!*host_offset) {
-                *host_offset = start_of_cluster(s, cluster_offset);
-            }
-
-            start           += cur_bytes;
-            remaining       -= cur_bytes;
-            cluster_offset  += cur_bytes;
-
-            break;
+            continue;
         } else {
             assert(cur_bytes == 0);
             break;
diff --git a/tests/qemu-iotests/044.out b/tests/qemu-iotests/044.out
index 34c25c793e..5c5aa929fb 100644
--- a/tests/qemu-iotests/044.out
+++ b/tests/qemu-iotests/044.out
@@ -1,6 +1,6 @@
 No errors were found on the image.
 7292415/33554432 = 21.73% allocated, 0.00% fragmented, 0.00% compressed clusters
-Image end offset: 4296447488
+Image end offset: 4296448000
 .
 ----------------------------------------------------------------------
 Ran 1 tests

From 5d186eb03eb37b257e29a4731ca484362d5fc4e4 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 27 Mar 2013 17:28:18 +0100
Subject: [PATCH 23/23] block: Fix direct use of protocols as driver for
 bdrv_open()

bdrv_open_common() implements direct use of protocols by copying the
pre-opened BlockDriverStates to bs using bdrv_swap(). It did however
first set some fields in bs, which end up in file after the swap. When
bdrv_open() destroys file, it appears to be open, and because it isn't,
qemu could segfault while trying to close it.

Reorder the operations to return immediately in such cases so that file
is correctly detected as closed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index 16a92a4e08..0ae2e93982 100644
--- a/block.c
+++ b/block.c
@@ -680,6 +680,18 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
 
+    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
+        return -ENOTSUP;
+    }
+
+    /* bdrv_open() with directly using a protocol as drv. This layer is already
+     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
+     * and return immediately. */
+    if (file != NULL && drv->bdrv_file_open) {
+        bdrv_swap(file, bs);
+        return 0;
+    }
+
     bs->open_flags = flags;
     bs->buffer_alignment = 512;
 
@@ -694,10 +706,6 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
         bs->filename[0] = '\0';
     }
 
-    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
-        return -ENOTSUP;
-    }
-
     bs->drv = drv;
     bs->opaque = g_malloc0(drv->instance_size);
 
@@ -708,13 +716,9 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     /* Open the image, either directly or using a protocol */
     if (drv->bdrv_file_open) {
-        if (file != NULL) {
-            bdrv_swap(file, bs);
-            ret = 0;
-        } else {
-            assert(drv->bdrv_parse_filename || filename != NULL);
-            ret = drv->bdrv_file_open(bs, filename, options, open_flags);
-        }
+        assert(file == NULL);
+        assert(drv->bdrv_parse_filename || filename != NULL);
+        ret = drv->bdrv_file_open(bs, filename, options, open_flags);
     } else {
         assert(file != NULL);
         bs->file = file;