From ac48e389d073bf2c8703745eef4824fabe0427ba Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:02 +0200
Subject: [PATCH 01/20] vvfat: Fix segfault on write to read-only disk

vvfat tries to set the readonly flag in its open function, but nowadays
this is overwritted with the readonly=... command line option. Check in
bdrv_write if the vvfat was opened read-only and return an error in this
case.

Without this check, vvfat tries to access the qcow bs, which is NULL
without enabled write support.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
---
 block/vvfat.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/vvfat.c b/block/vvfat.c
index 365332aa21..5898d664b0 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2665,6 +2665,11 @@ static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
 
 DLOG(checkpoint());
 
+    /* Check if we're operating in read-only mode */
+    if (s->qcow == NULL) {
+        return -EACCES;
+    }
+
     vvfat_close_current_file(s);
 
     /*

From 9217e26f43df4aab7deaea35b21caacc1f1f854b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:03 +0200
Subject: [PATCH 02/20] vvfat: Fix double free for opening the image rw

Allocation and deallocation of bs->opaque is not in the control of a
block driver. Therefore it should not set bs->opaque to a data structure
used by another bs, or closing the image will lead to a double free.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
---
 block/vvfat.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 5898d664b0..07720371c9 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2768,12 +2768,12 @@ static int vvfat_is_allocated(BlockDriverState *bs,
 
 static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
 	const uint8_t* buffer, int nb_sectors) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     return try_commit(s);
 }
 
 static void write_target_close(BlockDriverState *bs) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     bdrv_delete(s->qcow);
     free(s->qcow_filename);
 }
@@ -2816,7 +2816,8 @@ static int enable_write_target(BDRVVVFATState *s)
 
     s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
     s->bs->backing_hd->drv = &vvfat_write_target;
-    s->bs->backing_hd->opaque = s;
+    s->bs->backing_hd->opaque = qemu_malloc(sizeof(void*));
+    *(void**)s->bs->backing_hd->opaque = s;
 
     return 0;
 }

From a655211ac6d379c5b0813761e5d11415780f41fd Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:04 +0200
Subject: [PATCH 03/20] vvfat: Use cache=unsafe

The qcow file used for write support in vvfat is a temporary file,
so we can use cache=unsafe there. Without this, write support is just
too slow to be of any use.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
---
 block/vvfat.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 07720371c9..53e57bf228 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2788,6 +2788,7 @@ static int enable_write_target(BDRVVVFATState *s)
 {
     BlockDriver *bdrv_qcow;
     QEMUOptionParameter *options;
+    int ret;
     int size = sector2cluster(s, s->sector_count);
     s->used_clusters = calloc(size, 1);
 
@@ -2803,11 +2804,16 @@ static int enable_write_target(BDRVVVFATState *s)
 
     if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
 	return -1;
+
     s->qcow = bdrv_new("");
-    if (s->qcow == NULL ||
-        bdrv_open(s->qcow, s->qcow_filename, BDRV_O_RDWR, bdrv_qcow) < 0)
-    {
-	return -1;
+    if (s->qcow == NULL) {
+        return -1;
+    }
+
+    ret = bdrv_open(s->qcow, s->qcow_filename,
+            BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
+    if (ret < 0) {
+	return ret;
     }
 
 #ifndef _WIN32

From 72aef7318f54f0ec8c84c2bf2bb8edc5702d7dd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 12 Sep 2010 23:42:56 +0200
Subject: [PATCH 04/20] use qemu_blockalign consistently

Use qemu_blockalign for all allocations in the block layer.  This allows
increasing the required alignment, which is need to support O_DIRECT on
devices with large block sizes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-disk.c     | 9 +++++----
 hw/sd.c            | 2 +-
 posix-aio-compat.c | 2 +-
 qemu-io.c          | 2 +-
 qemu-nbd.c         | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 1446ca634c..ee20e8f04a 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -70,14 +70,15 @@ struct SCSIDiskState
     char *serial;
 };
 
-static SCSIDiskReq *scsi_new_request(SCSIDevice *d, uint32_t tag, uint32_t lun)
+static SCSIDiskReq *scsi_new_request(SCSIDiskState *s, uint32_t tag,
+        uint32_t lun)
 {
     SCSIRequest *req;
     SCSIDiskReq *r;
 
-    req = scsi_req_alloc(sizeof(SCSIDiskReq), d, tag, lun);
+    req = scsi_req_alloc(sizeof(SCSIDiskReq), &s->qdev, tag, lun);
     r = DO_UPCAST(SCSIDiskReq, req, req);
-    r->iov.iov_base = qemu_memalign(512, SCSI_DMA_BUF_SIZE);
+    r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE);
     return r;
 }
 
@@ -939,7 +940,7 @@ static int32_t scsi_send_command(SCSIDevice *d, uint32_t tag,
     }
     /* ??? Tags are not unique for different luns.  We only implement a
        single lun, so this should not matter.  */
-    r = scsi_new_request(d, tag, lun);
+    r = scsi_new_request(s, tag, lun);
     outbuf = (uint8_t *)r->iov.iov_base;
     is_write = 0;
     DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", lun, tag, buf[0]);
diff --git a/hw/sd.c b/hw/sd.c
index c928120abd..4bcf1c01a6 100644
--- a/hw/sd.c
+++ b/hw/sd.c
@@ -440,7 +440,7 @@ SDState *sd_init(BlockDriverState *bs, int is_spi)
     SDState *sd;
 
     sd = (SDState *) qemu_mallocz(sizeof(SDState));
-    sd->buf = qemu_memalign(512, 512);
+    sd->buf = qemu_blockalign(bs, 512);
     sd->spi = is_spi;
     sd->enable = 1;
     sd_reset(sd, bs);
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 10f1f037fb..842f1a24aa 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -270,7 +270,7 @@ static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
      * Ok, we have to do it the hard way, copy all segments into
      * a single aligned buffer.
      */
-    buf = qemu_memalign(512, aiocb->aio_nbytes);
+    buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
     if (aiocb->aio_type & QEMU_AIO_WRITE) {
         char *p = buf;
         int i;
diff --git a/qemu-io.c b/qemu-io.c
index bd3bd16fdf..b4e5cc8fe6 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -61,7 +61,7 @@ static void *qemu_io_alloc(size_t len, int pattern)
 
 	if (misalign)
 		len += MISALIGN_OFFSET;
-	buf = qemu_memalign(512, len);
+	buf = qemu_blockalign(bs, len);
 	memset(buf, pattern, len);
 	if (misalign)
 		buf += MISALIGN_OFFSET;
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 91b569f8e6..923a3bfd34 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -446,7 +446,7 @@ int main(int argc, char **argv)
     max_fd = sharing_fds[0];
     nb_fds++;
 
-    data = qemu_memalign(512, NBD_BUFFER_SIZE);
+    data = qemu_blockalign(bs, NBD_BUFFER_SIZE);
     if (data == NULL)
         errx(EXIT_FAILURE, "Cannot allocate data buffer");
 

From 581b9e29f36eec5de0779c3dbade980e4405d92e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 12 Sep 2010 23:43:21 +0200
Subject: [PATCH 05/20] raw-posix: handle > 512 byte alignment correctly

Replace the hardcoded handling of 512 byte alignment with bs->buffer_alignment
to handle larger sector size devices correctly.

Note that we can not rely on it to be initialize in bdrv_open, so deal
with the worst case there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 79 +++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 813372a6c7..a5cbb7e929 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -97,12 +97,12 @@
 #define FTYPE_CD     1
 #define FTYPE_FD     2
 
-#define ALIGNED_BUFFER_SIZE (32 * 512)
-
 /* if the FD is not accessed during that time (in ms), we try to
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
+#define MAX_BLOCKSIZE	4096
+
 typedef struct BDRVRawState {
     int fd;
     int type;
@@ -118,7 +118,8 @@ typedef struct BDRVRawState {
     int use_aio;
     void *aio_ctx;
 #endif
-    uint8_t* aligned_buf;
+    uint8_t *aligned_buf;
+    unsigned aligned_buf_size;
 } BDRVRawState;
 
 static int fd_open(BlockDriverState *bs);
@@ -161,7 +162,12 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     s->aligned_buf = NULL;
 
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
-        s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
+        /*
+         * Allocate a buffer for read/modify/write cycles.  Chose the size
+         * pessimistically as we don't know the block size yet.
+         */
+        s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
+        s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
         if (s->aligned_buf == NULL) {
             goto out_close;
         }
@@ -278,8 +284,9 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
 }
 
 /*
- * offset and count are in bytes, but must be multiples of 512 for files
- * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ * offset and count are in bytes, but must be multiples of the sector size
+ * for files opened with O_DIRECT. buf must be aligned to sector size bytes
+ * then.
  *
  * This function may be called without alignment if the caller ensures
  * that O_DIRECT is not in effect.
@@ -316,24 +323,25 @@ static int raw_pread(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    unsigned sector_mask = bs->buffer_alignment - 1;
     int size, ret, shift, sum;
 
     sum = 0;
 
     if (s->aligned_buf != NULL)  {
 
-        if (offset & 0x1ff) {
-            /* align offset on a 512 bytes boundary */
+        if (offset & sector_mask) {
+            /* align offset on a sector size bytes boundary */
 
-            shift = offset & 0x1ff;
-            size = (shift + count + 0x1ff) & ~0x1ff;
-            if (size > ALIGNED_BUFFER_SIZE)
-                size = ALIGNED_BUFFER_SIZE;
+            shift = offset & sector_mask;
+            size = (shift + count + sector_mask) & ~sector_mask;
+            if (size > s->aligned_buf_size)
+                size = s->aligned_buf_size;
             ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
             if (ret < 0)
                 return ret;
 
-            size = 512 - shift;
+            size = bs->buffer_alignment - shift;
             if (size > count)
                 size = count;
             memcpy(buf, s->aligned_buf + shift, size);
@@ -346,15 +354,15 @@ static int raw_pread(BlockDriverState *bs, int64_t offset,
             if (count == 0)
                 return sum;
         }
-        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+        if (count & sector_mask || (uintptr_t) buf & sector_mask) {
 
             /* read on aligned buffer */
 
             while (count) {
 
-                size = (count + 0x1ff) & ~0x1ff;
-                if (size > ALIGNED_BUFFER_SIZE)
-                    size = ALIGNED_BUFFER_SIZE;
+                size = (count + sector_mask) & ~sector_mask;
+                if (size > s->aligned_buf_size)
+                    size = s->aligned_buf_size;
 
                 ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
                 if (ret < 0) {
@@ -404,25 +412,28 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    unsigned sector_mask = bs->buffer_alignment - 1;
     int size, ret, shift, sum;
 
     sum = 0;
 
     if (s->aligned_buf != NULL) {
 
-        if (offset & 0x1ff) {
-            /* align offset on a 512 bytes boundary */
-            shift = offset & 0x1ff;
-            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+        if (offset & sector_mask) {
+            /* align offset on a sector size bytes boundary */
+            shift = offset & sector_mask;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf,
+                                    bs->buffer_alignment);
             if (ret < 0)
                 return ret;
 
-            size = 512 - shift;
+            size = bs->buffer_alignment - shift;
             if (size > count)
                 size = count;
             memcpy(s->aligned_buf + shift, buf, size);
 
-            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf,
+                                     bs->buffer_alignment);
             if (ret < 0)
                 return ret;
 
@@ -434,12 +445,12 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
             if (count == 0)
                 return sum;
         }
-        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+        if (count & sector_mask || (uintptr_t) buf & sector_mask) {
 
-            while ((size = (count & ~0x1ff)) != 0) {
+            while ((size = (count & ~sector_mask)) != 0) {
 
-                if (size > ALIGNED_BUFFER_SIZE)
-                    size = ALIGNED_BUFFER_SIZE;
+                if (size > s->aligned_buf_size)
+                    size = s->aligned_buf_size;
 
                 memcpy(s->aligned_buf, buf, size);
 
@@ -452,14 +463,16 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
                 count -= ret;
                 sum += ret;
             }
-            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            /* here, count < 512 because (count & ~sector_mask) == 0 */
             if (count) {
-                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf,
+                                     bs->buffer_alignment);
                 if (ret < 0)
                     return ret;
                  memcpy(s->aligned_buf, buf, count);
 
-                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf,
+                                     bs->buffer_alignment);
                  if (ret < 0)
                      return ret;
                  if (count < ret)
@@ -487,12 +500,12 @@ static int raw_write(BlockDriverState *bs, int64_t sector_num,
 /*
  * Check if all memory in this vector is sector aligned.
  */
-static int qiov_is_aligned(QEMUIOVector *qiov)
+static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
 
     for (i = 0; i < qiov->niov; i++) {
-        if ((uintptr_t) qiov->iov[i].iov_base % BDRV_SECTOR_SIZE) {
+        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
             return 0;
         }
     }
@@ -515,7 +528,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * driver that it needs to copy the buffer.
      */
     if (s->aligned_buf) {
-        if (!qiov_is_aligned(qiov)) {
+        if (!qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
         } else if (s->use_aio) {

From 5fe16888d33af7aca0d613d888c161af68c755c3 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <laurent@vivier.eu>
Date: Fri, 17 Sep 2010 18:13:57 +0200
Subject: [PATCH 06/20] Improve qemu-nbd performance by 4400 %

This patch allows to reduce the boot time from an NBD server from 225 seconds to
5 seconds (time between the "boot cd:0" and the kernel init) for the
following command lines:

./qemu-nbd -t ../ISO/debian-500-powerpc-netinst.iso
and
./ppc-softmmu/qemu-system-ppc -cdrom nbd:localhost:1024

This patch combines the reply header and payload send operation.

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 nbd.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/nbd.c b/nbd.c
index 011b50f166..4bf2eb7cb0 100644
--- a/nbd.c
+++ b/nbd.c
@@ -49,6 +49,7 @@
 
 /* This is all part of the "official" NBD API */
 
+#define NBD_REPLY_SIZE		(4 + 4 + 8)
 #define NBD_REQUEST_MAGIC       0x25609513
 #define NBD_REPLY_MAGIC         0x67446698
 
@@ -588,7 +589,7 @@ static int nbd_receive_request(int csock, struct nbd_request *request)
 
 int nbd_receive_reply(int csock, struct nbd_reply *reply)
 {
-	uint8_t buf[4 + 4 + 8];
+	uint8_t buf[NBD_REPLY_SIZE];
 	uint32_t magic;
 
 	memset(buf, 0xAA, sizeof(buf));
@@ -655,9 +656,9 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 	if (nbd_receive_request(csock, &request) == -1)
 		return -1;
 
-	if (request.len > data_size) {
+	if (request.len + NBD_REPLY_SIZE > data_size) {
 		LOG("len (%u) is larger than max len (%u)",
-		    request.len, data_size);
+		    request.len + NBD_REPLY_SIZE, data_size);
 		errno = EINVAL;
 		return -1;
 	}
@@ -687,7 +688,8 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 	case NBD_CMD_READ:
 		TRACE("Request type is READ");
 
-		if (bdrv_read(bs, (request.from + dev_offset) / 512, data,
+		if (bdrv_read(bs, (request.from + dev_offset) / 512,
+			      data + NBD_REPLY_SIZE,
 			      request.len / 512) == -1) {
 			LOG("reading from file failed");
 			errno = EINVAL;
@@ -697,12 +699,21 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 
 		TRACE("Read %u byte(s)", request.len);
 
-		if (nbd_send_reply(csock, &reply) == -1)
-			return -1;
+		/* Reply
+		   [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
+		   [ 4 ..  7]    error   (0 == no error)
+		   [ 7 .. 15]    handle
+		 */
+
+		cpu_to_be32w((uint32_t*)data, NBD_REPLY_MAGIC);
+		cpu_to_be32w((uint32_t*)(data + 4), reply.error);
+		cpu_to_be64w((uint64_t*)(data + 8), reply.handle);
 
 		TRACE("Sending data to client");
 
-		if (write_sync(csock, data, request.len) != request.len) {
+		if (write_sync(csock, data,
+			       request.len + NBD_REPLY_SIZE) !=
+			       request.len + NBD_REPLY_SIZE) {
 			LOG("writing to socket failed");
 			errno = EINVAL;
 			return -1;

From c2e2872bf49884d2b2382613891ce979e95c11ca Mon Sep 17 00:00:00 2001
From: Laurent Vivier <laurent@vivier.eu>
Date: Fri, 17 Sep 2010 20:37:25 +0200
Subject: [PATCH 07/20] nbd: correctly manage default port

block/nbd.c: use default port number when none is specified
qemu-nbd.c:  use IANA-assigned port number: 10809

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/nbd.c | 2 --
 qemu-nbd.c  | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 5e9d6cb8e6..c8dc763c6b 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -95,8 +95,6 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
             if (r == p) {
                 goto out;
             }
-        } else if (name == NULL) {
-            goto out;
         }
 
         sock = tcp_socket_outgoing(hostname, port);
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 923a3bfd34..99f1d22884 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -44,7 +44,7 @@ static void usage(const char *name)
 "Usage: %s [OPTIONS] FILE\n"
 "QEMU Disk Network Block Device Server\n"
 "\n"
-"  -p, --port=PORT      port to listen on (default `1024')\n"
+"  -p, --port=PORT      port to listen on (default `%d')\n"
 "  -o, --offset=OFFSET  offset into the image\n"
 "  -b, --bind=IFACE     interface to bind to (default `0.0.0.0')\n"
 "  -k, --socket=PATH    path to the unix socket\n"
@@ -62,7 +62,7 @@ static void usage(const char *name)
 "  -V, --version        output version information and exit\n"
 "\n"
 "Report bugs to <anthony@codemonkey.ws>\n"
-    , name, "DEVICE");
+    , name, NBD_DEFAULT_PORT, "DEVICE");
 }
 
 static void version(const char *name)
@@ -188,7 +188,7 @@ int main(int argc, char **argv)
     bool readonly = false;
     bool disconnect = false;
     const char *bindto = "0.0.0.0";
-    int port = 1024;
+    int port = NBD_DEFAULT_PORT;
     struct sockaddr_in addr;
     socklen_t addr_len = sizeof(addr);
     off_t fd_size;

From c01828fb518561235812d1035804e6efca31182a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Sep 2010 16:31:03 +0200
Subject: [PATCH 08/20] qcow2: Move sync out of write_refcount_block_entries

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4c19e7ebd8..7dc75d19f7 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -444,7 +444,7 @@ static int write_refcount_block_entries(BlockDriverState *bs,
     size = (last_index - first_index) << REFCOUNT_SHIFT;
 
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
-    ret = bdrv_pwrite_sync(bs->file,
+    ret = bdrv_pwrite(bs->file,
         refcount_block_offset + (first_index << REFCOUNT_SHIFT),
         &s->refcount_block_cache[first_index], size);
     if (ret < 0) {
@@ -551,6 +551,8 @@ fail:
         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
     }
 
+    bdrv_flush(bs->file);
+
     return ret;
 }
 

From 1c4c28149fff77b8c983fdabe4e76bdc8cadd572 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Sep 2010 16:36:58 +0200
Subject: [PATCH 09/20] qcow2: Move sync out of update_refcount

Note that the flush is omitted intentionally in qcow2_free_clusters. If
anything, we can leak clusters here if we lose the writes.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 7dc75d19f7..4fc3f80a9e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -261,6 +261,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
             goto fail_block;
         }
 
+        bdrv_flush(bs->file);
+
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
         memset(s->refcount_block_cache, 0, s->cluster_size);
@@ -551,8 +553,6 @@ fail:
         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
     }
 
-    bdrv_flush(bs->file);
-
     return ret;
 }
 
@@ -575,6 +575,8 @@ static int update_cluster_refcount(BlockDriverState *bs,
         return ret;
     }
 
+    bdrv_flush(bs->file);
+
     return get_refcount(bs, cluster_index);
 }
 
@@ -626,6 +628,9 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
     if (ret < 0) {
         return ret;
     }
+
+    bdrv_flush(bs->file);
+
     return offset;
 }
 
@@ -803,6 +808,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             if (ret < 0) {
                                 goto fail;
                             }
+
+                            /* TODO Flushing once for the whole function should
+                             * be enough */
+                            bdrv_flush(bs->file);
                         }
                         /* compressed clusters are never modified */
                         refcount = 2;

From 29216ed14f0bae35d1d9bb114a1aee7ee6837670 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Sep 2010 16:57:48 +0200
Subject: [PATCH 10/20] qcow2: Move sync out of qcow2_alloc_clusters

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c  | 3 +++
 block/qcow2-refcount.c | 4 ++--
 block/qcow2-snapshot.c | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f562b1602e..818c0dbd44 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -60,6 +60,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size)
         qemu_free(new_l1_table);
         return new_l1_table_offset;
     }
+    bdrv_flush(bs->file);
 
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
     for(i = 0; i < s->l1_size; i++)
@@ -243,6 +244,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     if (l2_offset < 0) {
         return l2_offset;
     }
+    bdrv_flush(bs->file);
 
     /* allocate a new entry in the l2 cache */
 
@@ -863,6 +865,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
         QLIST_REMOVE(m, next_in_flight);
         return cluster_offset;
     }
+    bdrv_flush(bs->file);
 
     /* save info needed for meta data update */
     m->offset = offset;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4fc3f80a9e..7082601f59 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -629,8 +629,6 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
         return ret;
     }
 
-    bdrv_flush(bs->file);
-
     return offset;
 }
 
@@ -678,6 +676,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
             goto redo;
         }
     }
+
+    bdrv_flush(bs->file);
     return offset;
 }
 
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 6228612f84..bbfcaaae22 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -138,6 +138,7 @@ static int qcow_write_snapshots(BlockDriverState *bs)
     snapshots_size = offset;
 
     snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
+    bdrv_flush(bs->file);
     offset = snapshots_offset;
     if (offset < 0) {
         return offset;
@@ -271,6 +272,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     if (l1_table_offset < 0) {
         goto fail;
     }
+    bdrv_flush(bs->file);
 
     sn->l1_table_offset = l1_table_offset;
     sn->l1_size = s->l1_size;

From 9f8e668eb1826434f61a63ff260d6c8b466e483a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Sep 2010 17:02:09 +0200
Subject: [PATCH 11/20] qcow2: Get rid of additional sync on COW

We always have a sync for the refcount update when a new cluster is
allocated. If we move this past the COW, we can save an additional sync.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 818c0dbd44..cb2e33f1ac 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -415,7 +415,7 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                         &s->aes_encrypt_key);
     }
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_write_sync(bs->file, (cluster_offset >> 9) + n_start,
+    ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start,
         s->cluster_data, n);
     if (ret < 0)
         return ret;
@@ -714,6 +714,13 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
      }
 
+    /*
+     * Before we update the L2 table to actually point to the new cluster, we
+     * need to be sure that the refcounts have been increased and COW was
+     * handled.
+     */
+    bdrv_flush(bs->file);
+
     ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
     if (ret < 0) {
         qcow2_l2_cache_reset(bs);
@@ -865,7 +872,6 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
         QLIST_REMOVE(m, next_in_flight);
         return cluster_offset;
     }
-    bdrv_flush(bs->file);
 
     /* save info needed for meta data update */
     m->offset = offset;

From 316a7af35029d52c971f5df044eb69901d6f16ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 12 Sep 2010 23:43:39 +0200
Subject: [PATCH 12/20] virtio-blk: propagate the required alignment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/virtio-blk.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index bd6bbe6b14..a1df26dbcf 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -540,6 +540,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
     register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                     virtio_blk_save, virtio_blk_load, s);
     bdrv_set_removable(s->bs, 0);
+    s->bs->buffer_alignment = conf->logical_block_size;
 
     return &s->vdev;
 }

From 73fdb1e1956c6ac10e76882a2e8426d794e02bb2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 12 Sep 2010 23:43:50 +0200
Subject: [PATCH 13/20] scsi-disk: propagate the required alignment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-disk.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index ee20e8f04a..9628b39a21 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -1178,6 +1178,7 @@ static int scsi_disk_initfn(SCSIDevice *dev)
         s->qdev.blocksize = s->qdev.conf.logical_block_size;
     }
     s->cluster_size = s->qdev.blocksize / 512;
+    s->bs->buffer_alignment = s->qdev.blocksize;
 
     s->qdev.type = TYPE_DISK;
     qemu_add_vm_change_state_handler(scsi_dma_restart_cb, s);

From 1b2adf28030ee3a570ba3a22401d44da2b18fe01 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 12 Sep 2010 23:44:00 +0200
Subject: [PATCH 14/20] ide: propagate the required alignment

IDE is a bit ugly in this respect.  For one it doesn't really keep track
of a sector size - most of the protocol is in units of 512 bytes, and we
assume 2048 bytes for CDROMs which is correct most of the time.

Second IDE allocates an I/O buffer long before we know if we're dealing
with a CDROM or not, so increase the alignment for the io_buffer
unconditionally.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 1e466d1ff5..06b6e14e56 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2645,6 +2645,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs,
     if (bdrv_get_type_hint(bs) == BDRV_TYPE_CDROM) {
         s->drive_kind = IDE_CD;
         bdrv_set_change_cb(bs, cdrom_change_cb, s);
+        bs->buffer_alignment = 2048;
     } else {
         if (!bdrv_is_inserted(s->bs)) {
             error_report("Device needs media, but drive is empty");
@@ -2679,7 +2680,8 @@ static void ide_init1(IDEBus *bus, int unit)
     s->bus = bus;
     s->unit = unit;
     s->drive_serial = drive_serial++;
-    s->io_buffer = qemu_blockalign(s->bs, IDE_DMA_BUF_SECTORS*512 + 4);
+    /* we need at least 2k alignment for accessing CDROMs using O_DIRECT */
+    s->io_buffer = qemu_memalign(2048, IDE_DMA_BUF_SECTORS*512 + 4);
     s->io_buffer_total_len = IDE_DMA_BUF_SECTORS*512 + 4;
     s->smart_selftest_data = qemu_blockalign(s->bs, 512);
     s->sector_write_timer = qemu_new_timer(vm_clock,

From b8a83a4f79ca4cd0689117b119ffaa1a91b00d52 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 13 Sep 2010 18:06:11 +0200
Subject: [PATCH 15/20] cutils: qemu_iovec_copy and qemu_iovec_memset

This adds two functions that work on QEMUIOVectors and will be used by the next
qcow2 patches.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 cutils.c      | 50 +++++++++++++++++++++++++++++++++++++++++---------
 qemu-common.h |  3 +++
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/cutils.c b/cutils.c
index 036ae3ce31..588373774a 100644
--- a/cutils.c
+++ b/cutils.c
@@ -168,30 +168,50 @@ void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
 }
 
 /*
- * Copies iovecs from src to the end dst until src is completely copied or the
- * total size of the copied iovec reaches size. The size of the last copied
- * iovec is changed in order to fit the specified total size if it isn't a
- * perfect fit already.
+ * Copies iovecs from src to the end of dst. It starts copying after skipping
+ * the given number of bytes in src and copies until src is completely copied
+ * or the total size of the copied iovec reaches size.The size of the last
+ * copied iovec is changed in order to fit the specified total size if it isn't
+ * a perfect fit already.
  */
-void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
+void qemu_iovec_copy(QEMUIOVector *dst, QEMUIOVector *src, uint64_t skip,
+    size_t size)
 {
     int i;
     size_t done;
+    void *iov_base;
+    uint64_t iov_len;
 
     assert(dst->nalloc != -1);
 
     done = 0;
     for (i = 0; (i < src->niov) && (done != size); i++) {
-        if (done + src->iov[i].iov_len > size) {
-            qemu_iovec_add(dst, src->iov[i].iov_base, size - done);
+        if (skip >= src->iov[i].iov_len) {
+            /* Skip the whole iov */
+            skip -= src->iov[i].iov_len;
+            continue;
+        } else {
+            /* Skip only part (or nothing) of the iov */
+            iov_base = (uint8_t*) src->iov[i].iov_base + skip;
+            iov_len = src->iov[i].iov_len - skip;
+            skip = 0;
+        }
+
+        if (done + iov_len > size) {
+            qemu_iovec_add(dst, iov_base, size - done);
             break;
         } else {
-            qemu_iovec_add(dst, src->iov[i].iov_base, src->iov[i].iov_len);
+            qemu_iovec_add(dst, iov_base, iov_len);
         }
-        done += src->iov[i].iov_len;
+        done += iov_len;
     }
 }
 
+void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
+{
+    qemu_iovec_copy(dst, src, 0, size);
+}
+
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
@@ -234,6 +254,18 @@ void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count)
     }
 }
 
+void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count)
+{
+    size_t n;
+    int i;
+
+    for (i = 0; i < qiov->niov && count; ++i) {
+        n = MIN(count, qiov->iov[i].iov_len);
+        memset(qiov->iov[i].iov_base, c, n);
+        count -= n;
+    }
+}
+
 #ifndef _WIN32
 /* Sets a specific flag */
 int fcntl_setfl(int fd, int flag)
diff --git a/qemu-common.h b/qemu-common.h
index dfd3dc08a2..5544ffd8cf 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -278,11 +278,14 @@ typedef struct QEMUIOVector {
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
+void qemu_iovec_copy(QEMUIOVector *dst, QEMUIOVector *src, uint64_t skip,
+    size_t size);
 void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size);
 void qemu_iovec_destroy(QEMUIOVector *qiov);
 void qemu_iovec_reset(QEMUIOVector *qiov);
 void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf);
 void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count);
+void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count);
 
 struct Monitor;
 typedef struct Monitor Monitor;

From bd28f835652e396841bb73080a07bcc08fe21bf0 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 13 Sep 2010 18:08:52 +0200
Subject: [PATCH 16/20] qcow2: Avoid bounce buffers for AIO read requests

qcow2 used to use bounce buffers for any AIO requests. This does not only imply
unnecessary copying, but also unbounded allocations which should be avoided.

This patch removes bounce buffers from the normal AIO read path, and constrains
them to a constant size for encrypted images.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c |  8 +++-
 block/qcow2.c         | 86 +++++++++++++++++++++++++++++--------------
 block/qcow2.h         |  4 +-
 3 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index cb2e33f1ac..fb4224a669 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -350,6 +350,8 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
     BDRVQcowState *s = bs->opaque;
     int ret, index_in_cluster, n, n1;
     uint64_t cluster_offset;
+    struct iovec iov;
+    QEMUIOVector qiov;
 
     while (nb_sectors > 0) {
         n = nb_sectors;
@@ -364,7 +366,11 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
         if (!cluster_offset) {
             if (bs->backing_hd) {
                 /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing_hd, sector_num, buf, n);
+                iov.iov_base = buf;
+                iov.iov_len = n * 512;
+                qemu_iovec_init_external(&qiov, &iov, 1);
+
+                n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n);
                 if (n1 > 0) {
                     BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING);
                     ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
diff --git a/block/qcow2.c b/block/qcow2.c
index a53014dbda..4a688b4863 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -311,8 +311,8 @@ static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
 }
 
 /* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs,
-                  int64_t sector_num, uint8_t *buf, int nb_sectors)
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+                  int64_t sector_num, int nb_sectors)
 {
     int n1;
     if ((sector_num + nb_sectors) <= bs->total_sectors)
@@ -321,7 +321,9 @@ int qcow2_backing_read1(BlockDriverState *bs,
         n1 = 0;
     else
         n1 = bs->total_sectors - sector_num;
-    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
+
+    qemu_iovec_memset(qiov, 0, 512 * (nb_sectors - n1));
+
     return n1;
 }
 
@@ -333,6 +335,7 @@ typedef struct QCowAIOCB {
     void *orig_buf;
     int remaining_sectors;
     int cur_nr_sectors;	/* number of sectors in current iteration */
+    uint64_t bytes_done;
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     BlockDriverAIOCB *hd_aiocb;
@@ -397,15 +400,19 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         /* nothing to do */
     } else {
         if (s->crypt_method) {
-            qcow2_encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
-                            acb->cur_nr_sectors, 0,
-                            &s->aes_decrypt_key);
+            qcow2_encrypt_sectors(s, acb->sector_num,  acb->cluster_data,
+                acb->cluster_data, acb->cur_nr_sectors, 0, &s->aes_decrypt_key);
+            qemu_iovec_reset(&acb->hd_qiov);
+            qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+                acb->cur_nr_sectors * 512);
+            qemu_iovec_from_buffer(&acb->hd_qiov, acb->cluster_data,
+                512 * acb->cur_nr_sectors);
         }
     }
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
-    acb->buf += acb->cur_nr_sectors * 512;
+    acb->bytes_done += acb->cur_nr_sectors * 512;
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
@@ -415,6 +422,11 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     /* prepare next AIO request */
     acb->cur_nr_sectors = acb->remaining_sectors;
+    if (s->crypt_method) {
+        acb->cur_nr_sectors = MIN(acb->cur_nr_sectors,
+            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+    }
+
     ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
         &acb->cur_nr_sectors, &acb->cluster_offset);
     if (ret < 0) {
@@ -423,15 +435,17 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
 
+    qemu_iovec_reset(&acb->hd_qiov);
+    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+        acb->cur_nr_sectors * 512);
+
     if (!acb->cluster_offset) {
+
         if (bs->backing_hd) {
             /* read from the base image */
-            n1 = qcow2_backing_read1(bs->backing_hd, acb->sector_num,
-                               acb->buf, acb->cur_nr_sectors);
+            n1 = qcow2_backing_read1(bs->backing_hd, &acb->hd_qiov,
+                acb->sector_num, acb->cur_nr_sectors);
             if (n1 > 0) {
-                acb->hd_iov.iov_base = (void *)acb->buf;
-                acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                 acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
                                     &acb->hd_qiov, acb->cur_nr_sectors,
@@ -445,7 +459,7 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             }
         } else {
             /* Note: in this case, no need to wait */
-            memset(acb->buf, 0, 512 * acb->cur_nr_sectors);
+            qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
             ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
             if (ret < 0)
                 goto done;
@@ -454,8 +468,11 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         /* add AIO support for compressed blocks ? */
         if (qcow2_decompress_cluster(bs, acb->cluster_offset) < 0)
             goto done;
-        memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512,
-               512 * acb->cur_nr_sectors);
+
+        qemu_iovec_from_buffer(&acb->hd_qiov,
+            s->cluster_cache + index_in_cluster * 512,
+            512 * acb->cur_nr_sectors);
+
         ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
         if (ret < 0)
             goto done;
@@ -465,9 +482,23 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             goto done;
         }
 
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        if (s->crypt_method) {
+            /*
+             * For encrypted images, read everything into a temporary
+             * contiguous buffer on which the AES functions can work.
+             */
+            if (!acb->cluster_data) {
+                acb->cluster_data =
+                    qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+            }
+
+            assert(acb->cur_nr_sectors <=
+                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+            qemu_iovec_reset(&acb->hd_qiov);
+            qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
+                512 * acb->cur_nr_sectors);
+        }
+
         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
         acb->hd_aiocb = bdrv_aio_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
@@ -481,11 +512,8 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     return;
 done:
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
     acb->common.cb(acb->common.opaque, ret);
+    qemu_iovec_destroy(&acb->hd_qiov);
     qemu_aio_release(acb);
 }
 
@@ -501,13 +529,17 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     acb->hd_aiocb = NULL;
     acb->sector_num = sector_num;
     acb->qiov = qiov;
-    if (qiov->niov > 1) {
-        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
-        if (is_write)
-            qemu_iovec_to_buffer(qiov, acb->buf);
-    } else {
+
+    if (!is_write) {
+        qemu_iovec_init(&acb->hd_qiov, qiov->niov);
+    } else if (qiov->niov == 1) {
         acb->buf = (uint8_t *)qiov->iov->iov_base;
+    } else {
+        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
+        qemu_iovec_to_buffer(qiov, acb->buf);
     }
+
+    acb->bytes_done = 0;
     acb->remaining_sectors = nb_sectors;
     acb->cur_nr_sectors = 0;
     acb->cluster_offset = 0;
diff --git a/block/qcow2.h b/block/qcow2.h
index 3ff162efcd..356a34af43 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -166,8 +166,8 @@ static inline int64_t align_offset(int64_t offset, int n)
 // FIXME Need qcow2_ prefix to global functions
 
 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs,
-                  int64_t sector_num, uint8_t *buf, int nb_sectors);
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+                  int64_t sector_num, int nb_sectors);
 
 /* qcow2-refcount.c functions */
 int qcow2_refcount_init(BlockDriverState *bs);

From 6f5f060b736cedc2d962114d20805fdd2889513a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 13 Sep 2010 18:24:10 +0200
Subject: [PATCH 17/20] qcow2: Avoid bounce buffers for AIO write requests

qcow2 used to use bounce buffers for any AIO requests. This does not only imply
unnecessary copying, but also unbounded allocations which should be avoided.

This patch removes bounce buffers from the normal AIO write path. Encrypted
images continue to use a bounce buffer, however with constant size.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 4a688b4863..ee3481b786 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -331,15 +331,12 @@ typedef struct QCowAIOCB {
     BlockDriverAIOCB common;
     int64_t sector_num;
     QEMUIOVector *qiov;
-    uint8_t *buf;
-    void *orig_buf;
     int remaining_sectors;
     int cur_nr_sectors;	/* number of sectors in current iteration */
     uint64_t bytes_done;
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     BlockDriverAIOCB *hd_aiocb;
-    struct iovec hd_iov;
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
     QCowL2Meta l2meta;
@@ -530,14 +527,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     acb->sector_num = sector_num;
     acb->qiov = qiov;
 
-    if (!is_write) {
-        qemu_iovec_init(&acb->hd_qiov, qiov->niov);
-    } else if (qiov->niov == 1) {
-        acb->buf = (uint8_t *)qiov->iov->iov_base;
-    } else {
-        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
-        qemu_iovec_to_buffer(qiov, acb->buf);
-    }
+    qemu_iovec_init(&acb->hd_qiov, qiov->niov);
 
     acb->bytes_done = 0;
     acb->remaining_sectors = nb_sectors;
@@ -589,7 +579,6 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
-    const uint8_t *src_buf;
     int n_end;
 
     acb->hd_aiocb = NULL;
@@ -605,7 +594,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
-    acb->buf += acb->cur_nr_sectors * 512;
+    acb->bytes_done += acb->cur_nr_sectors * 512;
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
@@ -636,20 +625,27 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     assert((acb->cluster_offset & 511) == 0);
 
+    qemu_iovec_reset(&acb->hd_qiov);
+    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+        acb->cur_nr_sectors * 512);
+
     if (s->crypt_method) {
         if (!acb->cluster_data) {
             acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
                                              s->cluster_size);
         }
-        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
-                        acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
-        src_buf = acb->cluster_data;
-    } else {
-        src_buf = acb->buf;
+
+        assert(acb->hd_qiov.size <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+        qemu_iovec_to_buffer(&acb->hd_qiov, acb->cluster_data);
+
+        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data,
+            acb->cluster_data, acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
+
+        qemu_iovec_reset(&acb->hd_qiov);
+        qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
+            acb->cur_nr_sectors * 512);
     }
-    acb->hd_iov.iov_base = (void *)src_buf;
-    acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
     acb->hd_aiocb = bdrv_aio_writev(bs->file,
                                     (acb->cluster_offset >> 9) + index_in_cluster,
@@ -667,9 +663,8 @@ fail:
         QLIST_REMOVE(&acb->l2meta, next_in_flight);
     }
 done:
-    if (acb->qiov->niov > 1)
-        qemu_vfree(acb->orig_buf);
     acb->common.cb(acb->common.opaque, ret);
+    qemu_iovec_destroy(&acb->hd_qiov);
     qemu_aio_release(acb);
 }
 

From f8b6d67251cfafed930d5f8be402fb0adac61299 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Mon, 6 Sep 2010 16:07:33 +0200
Subject: [PATCH 18/20] scsi-generic: add missing reset handler

Ensure that pending requests of a SCSI generic device are purged on
system reset. This also avoids calling a NULL function in lsi53c895a.
The lsi code was recently changed to call the .qdev.reset function.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-generic.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 953802777d..7212091695 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -455,15 +455,31 @@ static int get_stream_blocksize(BlockDriverState *bdrv)
     return (buf[9] << 16) | (buf[10] << 8) | buf[11];
 }
 
-static void scsi_destroy(SCSIDevice *d)
+static void scsi_generic_purge_requests(SCSIGenericState *s)
 {
-    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, d);
     SCSIGenericReq *r;
 
     while (!QTAILQ_EMPTY(&s->qdev.requests)) {
         r = DO_UPCAST(SCSIGenericReq, req, QTAILQ_FIRST(&s->qdev.requests));
+        if (r->req.aiocb) {
+            bdrv_aio_cancel(r->req.aiocb);
+        }
         scsi_remove_request(r);
     }
+}
+
+static void scsi_generic_reset(DeviceState *dev)
+{
+    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev.qdev, dev);
+
+    scsi_generic_purge_requests(s);
+}
+
+static void scsi_destroy(SCSIDevice *d)
+{
+    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, d);
+
+    scsi_generic_purge_requests(s);
     blockdev_mark_auto_del(s->qdev.conf.bs);
 }
 
@@ -537,6 +553,7 @@ static SCSIDeviceInfo scsi_generic_info = {
     .qdev.name    = "scsi-generic",
     .qdev.desc    = "pass through generic scsi device (/dev/sg*)",
     .qdev.size    = sizeof(SCSIGenericState),
+    .qdev.reset   = scsi_generic_reset,
     .init         = scsi_generic_initfn,
     .destroy      = scsi_destroy,
     .send_command = scsi_send_command,

From a5e3d9ef4dcca9b709999a98f6b37b9506c93ad3 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Mon, 6 Sep 2010 16:58:44 +0200
Subject: [PATCH 19/20] scsi_bus: fix length and xfer_mode for RESERVE and
 RELEASE commands

For the RESERVE and RELEASE commands the length must be zero
and xfer_mode must be SCSI_XFER_NONE.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-bus.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 7aa0bcd1ee..5a3fd4b7ac 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -208,6 +208,8 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
     case SEEK_6:
     case WRITE_FILEMARKS:
     case SPACE:
+    case RESERVE:
+    case RELEASE:
     case ERASE:
     case ALLOW_MEDIUM_REMOVAL:
     case VERIFY:
@@ -319,7 +321,6 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     case WRITE_BUFFER:
     case FORMAT_UNIT:
     case REASSIGN_BLOCKS:
-    case RESERVE:
     case SEARCH_EQUAL:
     case SEARCH_HIGH:
     case SEARCH_LOW:

From d9d334176c5dba23b47be07192f431b0c030928d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Tue, 21 Sep 2010 15:43:03 +0100
Subject: [PATCH 20/20] blkverify: Add block driver for verifying I/O

The blkverify block driver makes investigating image format data
corruption much easier.  A raw image initialized with the same contents
as the test image (e.g. qcow2 file) must be provided.  The raw image
mirrors read/write operations and is used to verify that data read from
the test image is correct.

See docs/blkverify.txt for more information.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 Makefile.objs      |   2 +-
 block/blkverify.c  | 382 +++++++++++++++++++++++++++++++++++++++++++++
 docs/blkverify.txt |  69 ++++++++
 3 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 block/blkverify.c
 create mode 100644 docs/blkverify.txt

diff --git a/Makefile.objs b/Makefile.objs
index 3ef6d80dfd..dad459343d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/blkverify.c b/block/blkverify.c
new file mode 100644
index 0000000000..4202685a63
--- /dev/null
+++ b/block/blkverify.c
@@ -0,0 +1,382 @@
+/*
+ * Block protocol for block driver correctness testing
+ *
+ * Copyright (C) 2010 IBM, Corp.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "block_int.h"
+
+typedef struct {
+    BlockDriverState *test_file;
+} BDRVBlkverifyState;
+
+typedef struct BlkverifyAIOCB BlkverifyAIOCB;
+struct BlkverifyAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+
+    /* Request metadata */
+    bool is_write;
+    int64_t sector_num;
+    int nb_sectors;
+
+    int ret;                    /* first completed request's result */
+    unsigned int done;          /* completion counter */
+    bool *finished;             /* completion signal for cancel */
+
+    QEMUIOVector *qiov;         /* user I/O vector */
+    QEMUIOVector raw_qiov;      /* cloned I/O vector for raw file */
+    void *buf;                  /* buffer for raw file I/O */
+
+    void (*verify)(BlkverifyAIOCB *acb);
+};
+
+static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb;
+    bool finished = false;
+
+    /* Wait until request completes, invokes its callback, and frees itself */
+    acb->finished = &finished;
+    while (!finished) {
+        qemu_aio_wait();
+    }
+}
+
+static AIOPool blkverify_aio_pool = {
+    .aiocb_size         = sizeof(BlkverifyAIOCB),
+    .cancel             = blkverify_aio_cancel,
+};
+
+static void blkverify_err(BlkverifyAIOCB *acb, const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    fprintf(stderr, "blkverify: %s sector_num=%ld nb_sectors=%d ",
+            acb->is_write ? "write" : "read", acb->sector_num,
+            acb->nb_sectors);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+    exit(1);
+}
+
+/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */
+static int blkverify_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    int ret;
+    char *raw, *c;
+
+    /* Parse the blkverify: prefix */
+    if (strncmp(filename, "blkverify:", strlen("blkverify:"))) {
+        return -EINVAL;
+    }
+    filename += strlen("blkverify:");
+
+    /* Parse the raw image filename */
+    c = strchr(filename, ':');
+    if (c == NULL) {
+        return -EINVAL;
+    }
+
+    raw = strdup(filename);
+    raw[c - filename] = '\0';
+    ret = bdrv_file_open(&bs->file, raw, flags);
+    free(raw);
+    if (ret < 0) {
+        return ret;
+    }
+    filename = c + 1;
+
+    /* Open the test file */
+    s->test_file = bdrv_new("");
+    ret = bdrv_open(s->test_file, filename, flags, NULL);
+    if (ret < 0) {
+        bdrv_delete(s->test_file);
+        s->test_file = NULL;
+        return ret;
+    }
+
+    return 0;
+}
+
+static void blkverify_close(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    bdrv_delete(s->test_file);
+    s->test_file = NULL;
+}
+
+static void blkverify_flush(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* Only flush test file, the raw file is not important */
+    bdrv_flush(s->test_file);
+}
+
+static int64_t blkverify_getlength(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    return bdrv_getlength(s->test_file);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * @a:          I/O vector
+ * @b:          I/O vector
+ * @ret:        Offset to first mismatching byte or -1 if match
+ */
+static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+    int i;
+    ssize_t offset = 0;
+
+    assert(a->niov == b->niov);
+    for (i = 0; i < a->niov; i++) {
+        size_t len = 0;
+        uint8_t *p = (uint8_t *)a->iov[i].iov_base;
+        uint8_t *q = (uint8_t *)b->iov[i].iov_base;
+
+        assert(a->iov[i].iov_len == b->iov[i].iov_len);
+        while (len < a->iov[i].iov_len && *p++ == *q++) {
+            len++;
+        }
+
+        offset += len;
+
+        if (len != a->iov[i].iov_len) {
+            return offset;
+        }
+    }
+    return -1;
+}
+
+typedef struct {
+    int src_index;
+    struct iovec *src_iov;
+    void *dest_base;
+} IOVectorSortElem;
+
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+    const IOVectorSortElem *elem_a = a;
+    const IOVectorSortElem *elem_b = b;
+
+    /* Don't overflow */
+    if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
+        return -1;
+    } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+    const IOVectorSortElem *elem_a = a;
+    const IOVectorSortElem *elem_b = b;
+
+    return elem_a->src_index - elem_b->src_index;
+}
+
+/**
+ * Copy contents of I/O vector
+ *
+ * The relative relationships of overlapping iovecs are preserved.  This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ */
+static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src,
+                                  void *buf)
+{
+    IOVectorSortElem sortelems[src->niov];
+    void *last_end;
+    int i;
+
+    /* Sort by source iovecs by base address */
+    for (i = 0; i < src->niov; i++) {
+        sortelems[i].src_index = i;
+        sortelems[i].src_iov = &src->iov[i];
+    }
+    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+    /* Allocate buffer space taking into account overlapping iovecs */
+    last_end = NULL;
+    for (i = 0; i < src->niov; i++) {
+        struct iovec *cur = sortelems[i].src_iov;
+        ptrdiff_t rewind = 0;
+
+        /* Detect overlap */
+        if (last_end && last_end > cur->iov_base) {
+            rewind = last_end - cur->iov_base;
+        }
+
+        sortelems[i].dest_base = buf - rewind;
+        buf += cur->iov_len - MIN(rewind, cur->iov_len);
+        last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+    }
+
+    /* Sort by source iovec index and build destination iovec */
+    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+    for (i = 0; i < src->niov; i++) {
+        qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+    }
+}
+
+static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
+                                         int64_t sector_num, QEMUIOVector *qiov,
+                                         int nb_sectors,
+                                         BlockDriverCompletionFunc *cb,
+                                         void *opaque)
+{
+    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+
+    acb->bh = NULL;
+    acb->is_write = is_write;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = -EINPROGRESS;
+    acb->done = 0;
+    acb->qiov = qiov;
+    acb->buf = NULL;
+    acb->verify = NULL;
+    acb->finished = NULL;
+    return acb;
+}
+
+static void blkverify_aio_bh(void *opaque)
+{
+    BlkverifyAIOCB *acb = opaque;
+
+    qemu_bh_delete(acb->bh);
+    if (acb->buf) {
+        qemu_iovec_destroy(&acb->raw_qiov);
+        qemu_vfree(acb->buf);
+    }
+    acb->common.cb(acb->common.opaque, acb->ret);
+    if (acb->finished) {
+        *acb->finished = true;
+    }
+    qemu_aio_release(acb);
+}
+
+static void blkverify_aio_cb(void *opaque, int ret)
+{
+    BlkverifyAIOCB *acb = opaque;
+
+    switch (++acb->done) {
+    case 1:
+        acb->ret = ret;
+        break;
+
+    case 2:
+        if (acb->ret != ret) {
+            blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret);
+        }
+
+        if (acb->verify) {
+            acb->verify(acb);
+        }
+
+        acb->bh = qemu_bh_new(blkverify_aio_bh, acb);
+        qemu_bh_schedule(acb->bh);
+        break;
+    }
+}
+
+static void blkverify_verify_readv(BlkverifyAIOCB *acb)
+{
+    ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov);
+    if (offset != -1) {
+        blkverify_err(acb, "contents mismatch in sector %ld",
+                      acb->sector_num + (offset / BDRV_SECTOR_SIZE));
+    }
+}
+
+static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov,
+                                            nb_sectors, cb, opaque);
+
+    acb->verify = blkverify_verify_readv;
+    acb->buf = qemu_blockalign(bs->file, qiov->size);
+    qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
+    blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
+
+    if (!bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
+                        blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    if (!bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
+                        blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
+                                            nb_sectors, cb, opaque);
+
+    if (!bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
+                         blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    if (!bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+                         blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* Only flush test file, the raw file is not important */
+    return bdrv_aio_flush(s->test_file, cb, opaque);
+}
+
+static BlockDriver bdrv_blkverify = {
+    .format_name        = "blkverify",
+    .protocol_name      = "blkverify",
+
+    .instance_size      = sizeof(BDRVBlkverifyState),
+
+    .bdrv_getlength     = blkverify_getlength,
+
+    .bdrv_file_open     = blkverify_open,
+    .bdrv_close         = blkverify_close,
+    .bdrv_flush         = blkverify_flush,
+
+    .bdrv_aio_readv     = blkverify_aio_readv,
+    .bdrv_aio_writev    = blkverify_aio_writev,
+    .bdrv_aio_flush     = blkverify_aio_flush,
+};
+
+static void bdrv_blkverify_init(void)
+{
+    bdrv_register(&bdrv_blkverify);
+}
+
+block_init(bdrv_blkverify_init);
diff --git a/docs/blkverify.txt b/docs/blkverify.txt
new file mode 100644
index 0000000000..d556dc4e6d
--- /dev/null
+++ b/docs/blkverify.txt
@@ -0,0 +1,69 @@
+= Block driver correctness testing with blkverify =
+
+== Introduction ==
+
+This document describes how to use the blkverify protocol to test that a block
+driver is operating correctly.
+
+It is difficult to test and debug block drivers against real guests.  Often
+processes inside the guest will crash because corrupt sectors were read as part
+of the executable.  Other times obscure errors are raised by a program inside
+the guest.  These issues are extremely hard to trace back to bugs in the block
+driver.
+
+Blkverify solves this problem by catching data corruption inside QEMU the first
+time bad data is read and reporting the disk sector that is corrupted.
+
+== How it works ==
+
+The blkverify protocol has two child block devices, the "test" device and the
+"raw" device.  Read/write operations are mirrored to both devices so their
+state should always be in sync.
+
+The "raw" device is a raw image, a flat file, that has identical starting
+contents to the "test" image.  The idea is that the "raw" device will handle
+read/write operations correctly and not corrupt data.  It can be used as a
+reference for comparison against the "test" device.
+
+After a mirrored read operation completes, blkverify will compare the data and
+raise an error if it is not identical.  This makes it possible to catch the
+first instance where corrupt data is read.
+
+== Example ==
+
+Imagine raw.img has 0xcd repeated throughout its first sector:
+
+    $ ./qemu-io -c 'read -v 0 512' raw.img
+    00000000:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    00000010:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    [...]
+    000001e0:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    000001f0:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    read 512/512 bytes at offset 0
+    512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec)
+
+And test.img is corrupt, its first sector is zeroed when it shouldn't be:
+
+    $ ./qemu-io -c 'read -v 0 512' test.img
+    00000000:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    00000010:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    [...]
+    000001e0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    000001f0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    read 512/512 bytes at offset 0
+    512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec)
+
+This error is caught by blkverify:
+
+    $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img
+    blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
+
+A more realistic scenario is verifying the installation of a guest OS:
+
+    $ ./qemu-img create raw.img 16G
+    $ ./qemu-img create -f qcow2 test.qcow2 16G
+    $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \
+                                        -drive file=blkverify:raw.img:test.qcow2
+
+If the installation is aborted when blkverify detects corruption, use qemu-io
+to explore the contents of the disk image at the sector in question.