From 819fa276311ce328a8e38ad9306c1093961b3f4b Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 9 Dec 2014 18:15:08 +0000
Subject: [PATCH 01/47] Restore atapi_dma flag across migration

If a migration happens just after the guest has kicked
off an ATAPI command and kicked off DMA, we lose the atapi_dma
flag, and the destination tries to complete the command as PIO
rather than DMA.  This upsets Linux; modern libata based kernels
stumble and recover OK, older kernels end up passing bad data
to userspace.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index d4af5e2eb1..ac3f015a8d 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2417,6 +2417,7 @@ static int ide_drive_pio_post_load(void *opaque, int version_id)
     s->end_transfer_func = transfer_end_table[s->end_transfer_fn_idx];
     s->data_ptr = s->io_buffer + s->cur_io_buffer_offset;
     s->data_end = s->data_ptr + s->cur_io_buffer_len;
+    s->atapi_dma = s->feature & 1; /* as per cmd_packet */
 
     return 0;
 }

From a71754e5b03fd3b8b8c6d3bc2a39f75bead729de Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 9 Dec 2014 18:15:09 +0000
Subject: [PATCH 02/47] atapi migration: Throw recoverable error to avoid
 recovery

(With the previous atapi_dma flag recovery)
If migration happens between the ATAPI command being written and the
bmdma being started, the DMA is dropped.  Eventually the guest times
out and recovers, but that can take many seconds.
(This is rare, on a pingpong reading the CD continuously I hit
this about ~1/30-1/50 migrates)

I don't think we've got enough state to be able to recover safely
at this point, so I throw a 'medium error, no seek complete'
that I'm assuming guests will try and recover from an apparently
dirty CD.

OK, it's a hack, the real solution is probably to push a lot of
ATAPI state into the migration stream, but this is a fix that
works with no stream changes. Tested only on Linux (both RHEL5
(pre-libata) and RHEL7).

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/atapi.c    | 17 +++++++++++++++++
 hw/ide/internal.h |  2 ++
 hw/ide/pci.c      | 11 +++++++++++
 3 files changed, 30 insertions(+)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index a71e6e014f..1bf8b34528 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -394,6 +394,23 @@ static void ide_atapi_cmd_read(IDEState *s, int lba, int nb_sectors,
     }
 }
 
+
+/* Called by *_restart_bh when the transfer function points
+ * to ide_atapi_cmd
+ */
+void ide_atapi_dma_restart(IDEState *s)
+{
+    /*
+     * I'm not sure we have enough stored to restart the command
+     * safely, so give the guest an error it should recover from.
+     * I'm assuming most guests will try to recover from something
+     * listed as a medium error on a CD; it seems to work on Linux.
+     * This would be more of a problem if we did any other type of
+     * DMA operation.
+     */
+    ide_atapi_cmd_error(s, MEDIUM_ERROR, ASC_NO_SEEK_COMPLETE);
+}
+
 static inline uint8_t ide_atapi_set_profile(uint8_t *buf, uint8_t *index,
                                             uint16_t profile)
 {
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index c998003bf3..ee9a57f039 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -289,6 +289,7 @@ typedef struct IDEDMAOps IDEDMAOps;
 #define ATAPI_INT_REASON_TAG            0xf8
 
 /* same constants as bochs */
+#define ASC_NO_SEEK_COMPLETE                 0x02
 #define ASC_ILLEGAL_OPCODE                   0x20
 #define ASC_LOGICAL_BLOCK_OOR                0x21
 #define ASC_INV_FIELD_IN_CMD_PACKET          0x24
@@ -530,6 +531,7 @@ void ide_dma_error(IDEState *s);
 
 void ide_atapi_cmd_ok(IDEState *s);
 void ide_atapi_cmd_error(IDEState *s, int sense_key, int asc);
+void ide_atapi_dma_restart(IDEState *s);
 void ide_atapi_io_error(IDEState *s, int ret);
 
 void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index bee5ad39fe..e3f2054a90 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -235,6 +235,17 @@ static void bmdma_restart_bh(void *opaque)
         }
     } else if (error_status & IDE_RETRY_FLUSH) {
         ide_flush_cache(bmdma_active_if(bm));
+    } else {
+        IDEState *s = bmdma_active_if(bm);
+
+        /*
+         * We've not got any bits to tell us about ATAPI - but
+         * we do have the end_transfer_func that tells us what
+         * we're trying to do.
+         */
+        if (s->end_transfer_func == ide_atapi_cmd) {
+            ide_atapi_dma_restart(s);
+        }
     }
 }
 

From 1486df0e312613c6f235ee08488c90cc6b987094 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:11 +0300
Subject: [PATCH 03/47] block/raw-posix: create translate_err helper to merge
 errno values

actually the code
    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
        ret == -ENOTTY) {
        ret = -ENOTSUP;
    }
is present twice and will be added a couple more times. Create helper
for this.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index e51293a455..24300d0f78 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -893,6 +893,15 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 }
 #endif
 
+static int translate_err(int err)
+{
+    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
+        err == -ENOTTY) {
+        err = -ENOTSUP;
+    }
+    return err;
+}
+
 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 {
     int ret = -EOPNOTSUPP;
@@ -921,10 +930,9 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 #endif
     }
 
-    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
-        ret == -ENOTTY) {
+    ret = translate_err(ret);
+    if (ret == -ENOTSUP) {
         s->has_write_zeroes = false;
-        ret = -ENOTSUP;
     }
     return ret;
 }
@@ -968,10 +976,9 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 #endif
     }
 
-    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
-        ret == -ENOTTY) {
+    ret = translate_err(ret);
+    if (ret == -ENOTSUP) {
         s->has_discard = false;
-        ret = -ENOTSUP;
     }
     return ret;
 }

From 0b99171230bcdfe49956f283fc2e51a37efc7bfc Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:12 +0300
Subject: [PATCH 04/47] block/raw-posix: create do_fallocate helper

The pattern
    do {
        if (fallocate(s->fd, mode, offset, len) == 0) {
            return 0;
        }
    } while (errno == EINTR);
    ret = translate_err(-errno);
will be commonly useful in next patches. Create helper for it.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 24300d0f78..2aa268aeab 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -902,6 +902,18 @@ static int translate_err(int err)
     return err;
 }
 
+#if defined(CONFIG_FALLOCATE_PUNCH_HOLE)
+static int do_fallocate(int fd, int mode, off_t offset, off_t len)
+{
+    do {
+        if (fallocate(fd, mode, offset, len) == 0) {
+            return 0;
+        }
+    } while (errno == EINTR);
+    return translate_err(-errno);
+}
+#endif
+
 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 {
     int ret = -EOPNOTSUPP;
@@ -965,14 +977,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 #endif
 
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
-        do {
-            if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                          aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
-                return 0;
-            }
-        } while (errno == EINTR);
-
-        ret = -errno;
+        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                           aiocb->aio_offset, aiocb->aio_nbytes);
 #endif
     }
 

From 37cc9f7f684ed035da63274daca1594c7ee16213 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:13 +0300
Subject: [PATCH 05/47] block/raw-posix: refactor handle_aiocb_write_zeroes a
 bit

move code dealing with a block device to a separate function. This will
allow to implement additional processing for ordinary files.

Please note, that xfs_code has been moved before checking for
s->has_write_zeroes as xfs_write_zeroes does not touch this flag inside.
This makes code a bit more consistent.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 48 +++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 2aa268aeab..d3910fdbf3 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -914,41 +914,49 @@ static int do_fallocate(int fd, int mode, off_t offset, off_t len)
 }
 #endif
 
-static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
 {
-    int ret = -EOPNOTSUPP;
+    int ret = -ENOTSUP;
     BDRVRawState *s = aiocb->bs->opaque;
 
-    if (s->has_write_zeroes == 0) {
+    if (!s->has_write_zeroes) {
         return -ENOTSUP;
     }
 
-    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
 #ifdef BLKZEROOUT
-        do {
-            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
-            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
-                return 0;
-            }
-        } while (errno == EINTR);
-
-        ret = -errno;
-#endif
-    } else {
-#ifdef CONFIG_XFS
-        if (s->is_xfs) {
-            return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+    do {
+        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+            return 0;
         }
-#endif
-    }
+    } while (errno == EINTR);
+
+    ret = translate_err(-errno);
+#endif
 
-    ret = translate_err(ret);
     if (ret == -ENOTSUP) {
         s->has_write_zeroes = false;
     }
     return ret;
 }
 
+static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+{
+    BDRVRawState *s = aiocb->bs->opaque;
+
+    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+        return handle_aiocb_write_zeroes_block(aiocb);
+    }
+
+#ifdef CONFIG_XFS
+    if (s->is_xfs) {
+        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+    }
+#endif
+
+    return -ENOTSUP;
+}
+
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 {
     int ret = -EOPNOTSUPP;

From b953f075007a7860adce9fa410b2e116a3d5e29c Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:14 +0300
Subject: [PATCH 06/47] block: use fallocate(FALLOC_FL_ZERO_RANGE) in
 handle_aiocb_write_zeroes

This efficiently writes zeroes on Linux if the kernel is capable enough.
FALLOC_FL_ZERO_RANGE correctly handles all cases, including and not
including file expansion.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 15 +++++++++++++--
 configure         | 19 +++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index d3910fdbf3..5a777e7137 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -60,7 +60,7 @@
 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
 #endif
 #endif
-#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
 #include <linux/falloc.h>
 #endif
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -902,7 +902,7 @@ static int translate_err(int err)
     return err;
 }
 
-#if defined(CONFIG_FALLOCATE_PUNCH_HOLE)
+#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
 {
     do {
@@ -954,6 +954,17 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
     }
 #endif
 
+#ifdef CONFIG_FALLOCATE_ZERO_RANGE
+    if (s->has_write_zeroes) {
+        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
+                               aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0 || ret != -ENOTSUP) {
+            return ret;
+        }
+        s->has_write_zeroes = false;
+    }
+#endif
+
     return -ENOTSUP;
 }
 
diff --git a/configure b/configure
index f185dd0838..e00e03af32 100755
--- a/configure
+++ b/configure
@@ -3335,6 +3335,22 @@ if compile_prog "" "" ; then
   fallocate_punch_hole=yes
 fi
 
+# check that fallocate supports range zeroing inside the file
+fallocate_zero_range=no
+cat > $TMPC << EOF
+#include <fcntl.h>
+#include <linux/falloc.h>
+
+int main(void)
+{
+    fallocate(0, FALLOC_FL_ZERO_RANGE, 0, 0);
+    return 0;
+}
+EOF
+if compile_prog "" "" ; then
+  fallocate_zero_range=yes
+fi
+
 # check for posix_fallocate
 posix_fallocate=no
 cat > $TMPC << EOF
@@ -4567,6 +4583,9 @@ fi
 if test "$fallocate_punch_hole" = "yes" ; then
   echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak
 fi
+if test "$fallocate_zero_range" = "yes" ; then
+  echo "CONFIG_FALLOCATE_ZERO_RANGE=y" >> $config_host_mak
+fi
 if test "$posix_fallocate" = "yes" ; then
   echo "CONFIG_POSIX_FALLOCATE=y" >> $config_host_mak
 fi

From d50d82221934696633296975aa433fe8aeeac714 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:15 +0300
Subject: [PATCH 07/47] block/raw-posix: call plain fallocate in
 handle_aiocb_write_zeroes

There is a possibility that we are extending our image and thus writing
zeroes beyond the end of the file. In this case we do not need to care
about the hole to make sure that there is no data in the file under
this offset (pre-condition to fallocate(0) to work). We could simply call
fallocate(0).

This improves the performance of writing zeroes even on really old
platforms which do not have even FALLOC_FL_PUNCH_HOLE.

Before the patch do_fallocate was used when either
CONFIG_FALLOCATE_PUNCH_HOLE or CONFIG_FALLOCATE_ZERO_RANGE are defined.
Now the story is different. CONFIG_FALLOCATE is defined when Linux
fallocate is defined, posix_fallocate is completely different story
(CONFIG_POSIX_FALLOCATE). CONFIG_FALLOCATE is mandatory prerequite
for both CONFIG_FALLOCATE_PUNCH_HOLE and CONFIG_FALLOCATE_ZERO_RANGE
thus we are on the safe side.

CC: Max Reitz <mreitz@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 5a777e7137..1c88ad89d8 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -147,6 +147,7 @@ typedef struct BDRVRawState {
     bool has_discard:1;
     bool has_write_zeroes:1;
     bool discard_zeroes:1;
+    bool has_fallocate;
     bool needs_alignment;
 } BDRVRawState;
 
@@ -452,6 +453,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
     if (S_ISREG(st.st_mode)) {
         s->discard_zeroes = true;
+        s->has_fallocate = true;
     }
     if (S_ISBLK(st.st_mode)) {
 #ifdef BLKDISCARDZEROES
@@ -902,7 +904,7 @@ static int translate_err(int err)
     return err;
 }
 
-#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
+#ifdef CONFIG_FALLOCATE
 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
 {
     do {
@@ -965,6 +967,16 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
     }
 #endif
 
+#ifdef CONFIG_FALLOCATE
+    if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
+        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0 || ret != -ENOTSUP) {
+            return ret;
+        }
+        s->has_fallocate = false;
+    }
+#endif
+
     return -ENOTSUP;
 }
 

From 1cdc3239f1bb8c8f18954defe3cb813edc9df4a0 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 30 Jan 2015 11:42:16 +0300
Subject: [PATCH 08/47] block: use fallocate(FALLOC_FL_PUNCH_HOLE) &
 fallocate(0) to write zeroes

This sequence works efficiently if FALLOC_FL_ZERO_RANGE is not supported.
Unfortunately, FALLOC_FL_ZERO_RANGE is supported on really modern systems
and only for a couple of filesystems. FALLOC_FL_PUNCH_HOLE is much more
mature.

The sequence of 2 operations FALLOC_FL_PUNCH_HOLE and 0 is necessary due
to the following reasons:
- FALLOC_FL_PUNCH_HOLE creates a hole in the file, the file becomes
  sparse. In order to retain original functionality we must allocate
  disk space afterwards. This is done using fallocate(0) call
- fallocate(0) without preceeding FALLOC_FL_PUNCH_HOLE will do nothing
  if called above already allocated areas of the file, i.e. the content
  will not be zeroed

This should increase the performance a bit for not-so-modern kernels.

CC: Max Reitz <mreitz@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
CC: Fam Zheng <famz@redhat.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 1c88ad89d8..7b42f37d83 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -967,6 +967,25 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
     }
 #endif
 
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+    if (s->has_discard && s->has_fallocate) {
+        int ret = do_fallocate(s->fd,
+                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                               aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0) {
+            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
+            if (ret == 0 || ret != -ENOTSUP) {
+                return ret;
+            }
+            s->has_fallocate = false;
+        } else if (ret != -ENOTSUP) {
+            return ret;
+        } else {
+            s->has_discard = false;
+        }
+    }
+#endif
+
 #ifdef CONFIG_FALLOCATE
     if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);

From 98764152ad8ec9fa4e7bb6d8e10f8a7a7ce273d7 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 15:48:34 +0100
Subject: [PATCH 09/47] block: change default for discard and write zeroes to
 INT_MAX

do not trim requests if the driver does not supply a limit
through BlockLimits. For write zeroes we still keep a limit
for the unsupported path to avoid allocating a big bounce buffer.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Suggested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index d45e4ddf31..ee7ff2cac8 100644
--- a/block.c
+++ b/block.c
@@ -3192,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                             BDRV_REQ_COPY_ON_READ);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_WRITE_ZEROES_DEFAULT 32768
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
@@ -3206,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int ret = 0;
 
     int max_write_zeroes = bs->bl.max_write_zeroes ?
-                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
+                           bs->bl.max_write_zeroes : INT_MAX;
 
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
@@ -3242,7 +3239,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
         if (ret == -ENOTSUP) {
             /* Fall back to bounce buffer if write zeroes is unsupported */
             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
-                                            MAX_WRITE_ZEROES_DEFAULT);
+                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
             num = MIN(num, max_xfer_len);
             iov.iov_len = num * BDRV_SECTOR_SIZE;
             if (iov.iov_base == NULL) {
@@ -5097,11 +5094,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_DISCARD_DEFAULT 32768
-
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                  int nb_sectors)
 {
@@ -5126,7 +5118,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
+    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
     while (nb_sectors > 0) {
         int ret;
         int num = nb_sectors;

From 61979a6adfc1c4590f31ca295998e160226785c4 Mon Sep 17 00:00:00 2001
From: Don Slutz <dslutz@verizon.com>
Date: Fri, 9 Jan 2015 10:17:35 -0500
Subject: [PATCH 10/47] qemu-img: Add QEMU_PKGVERSION to QEMU_IMG_VERSION

This is the same way vl.c handles this.

Signed-off-by: Don Slutz <dslutz@verizon.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-img.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index 4e9a7f5741..e148af8a3e 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -35,7 +35,7 @@
 #include "block/qapi.h"
 #include <getopt.h>
 
-#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION \
+#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION QEMU_PKGVERSION \
                           ", Copyright (c) 2004-2008 Fabrice Bellard\n"
 
 typedef struct img_cmd_t {

From 35f5a49374098733247c640cbdcbafcfc792c11f Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Wed, 28 Jan 2015 09:51:13 +0800
Subject: [PATCH 11/47] qed: Really remove unused field QEDAIOCB.finished

The commit 533ffb17a that removed qed_aiocb_info.cancel said to remove
this but didn't do it.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/qed.h b/block/qed.h
index d3934a05cd..615e676fc8 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -133,7 +133,6 @@ typedef struct QEDAIOCB {
     int bh_ret;                     /* final return status for completion bh */
     QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
     int flags;                      /* QED_AIOCB_* bits ORed together */
-    bool *finished;                 /* signal for cancel completion */
     uint64_t end_pos;               /* request end on block device, in bytes */
 
     /* User scatter-gather list */

From f4564d53c6952c61fb3ed8ee17a6e0f2f6bf0857 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 14:52:18 +0100
Subject: [PATCH 12/47] block: add accounting for merged requests

Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                    |  2 ++
 block/accounting.c         |  7 +++++++
 block/qapi.c               |  2 ++
 hmp.c                      |  6 +++++-
 include/block/accounting.h |  3 +++
 qapi/block-core.json       |  9 ++++++++-
 qmp-commands.hx            | 22 ++++++++++++++++++----
 7 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index ee7ff2cac8..8272ef901e 100644
--- a/block.c
+++ b/block.c
@@ -4559,6 +4559,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
         }
     }
 
+    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
     return outidx + 1;
 }
 
diff --git a/block/accounting.c b/block/accounting.c
index 18102f015d..01d594ffdc 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -54,3 +54,10 @@ void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
         stats->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 }
+
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                      int num_requests)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+    stats->merged[type] += num_requests;
+}
diff --git a/block/qapi.c b/block/qapi.c
index 75c388e90b..d1a891796c 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -335,6 +335,8 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
     s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
     s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
     s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
+    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ];
+    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE];
     s->stats->wr_highest_offset =
         bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
     s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
diff --git a/hmp.c b/hmp.c
index a42c5c07be..b47f331f4d 100644
--- a/hmp.c
+++ b/hmp.c
@@ -474,6 +474,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        " wr_total_time_ns=%" PRId64
                        " rd_total_time_ns=%" PRId64
                        " flush_total_time_ns=%" PRId64
+                       " rd_merged=%" PRId64
+                       " wr_merged=%" PRId64
                        "\n",
                        stats->value->stats->rd_bytes,
                        stats->value->stats->wr_bytes,
@@ -482,7 +484,9 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        stats->value->stats->flush_operations,
                        stats->value->stats->wr_total_time_ns,
                        stats->value->stats->rd_total_time_ns,
-                       stats->value->stats->flush_total_time_ns);
+                       stats->value->stats->flush_total_time_ns,
+                       stats->value->stats->rd_merged,
+                       stats->value->stats->wr_merged);
     }
 
     qapi_free_BlockStatsList(stats_list);
diff --git a/include/block/accounting.h b/include/block/accounting.h
index 50b42b3808..4c406cff7a 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -39,6 +39,7 @@ typedef struct BlockAcctStats {
     uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
     uint64_t nr_ops[BLOCK_MAX_IOTYPE];
     uint64_t total_time_ns[BLOCK_MAX_IOTYPE];
+    uint64_t merged[BLOCK_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 } BlockAcctStats;
 
@@ -53,5 +54,7 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie);
 void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
                                unsigned int nb_sectors);
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                           int num_requests);
 
 #endif
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 80984d1660..b7d9772444 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -407,13 +407,20 @@
 #                     growable sparse files (like qcow2) that are used on top
 #                     of a physical device.
 #
+# @rd_merged: Number of read requests that have been merged into another
+#             request (Since 2.3).
+#
+# @wr_merged: Number of write requests that have been merged into another
+#             request (Since 2.3).
+#
 # Since: 0.14.0
 ##
 { 'type': 'BlockDeviceStats',
   'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int',
            'wr_operations': 'int', 'flush_operations': 'int',
            'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int',
-           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int' } }
+           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int',
+           'rd_merged': 'int', 'wr_merged': 'int' } }
 
 ##
 # @BlockStats:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index c5f16dd922..af3fd19935 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2303,6 +2303,10 @@ Each json-object contain the following:
     - "flush_total_time_ns": total time spend on cache flushes in nano-seconds (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
+    - "rd_merged": number of read requests that have been merged into
+                   another request (json-int)
+    - "wr_merged": number of write requests that have been merged into
+                   another request (json-int)
 - "parent": Contains recursively the statistics of the underlying
             protocol (e.g. the host file for a qcow2 image). If there is
             no underlying protocol, this field is omitted
@@ -2326,6 +2330,8 @@ Example:
                   "rd_total_times_ns":3465673657
                   "flush_total_times_ns":49653
                   "flush_operations":61,
+                  "rd_merged":0,
+                  "wr_merged":0
                }
             },
             "stats":{
@@ -2337,7 +2343,9 @@ Example:
                "flush_operations":51,
                "wr_total_times_ns":313253456
                "rd_total_times_ns":3465673657
-               "flush_total_times_ns":49653
+               "flush_total_times_ns":49653,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2351,7 +2359,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2365,7 +2375,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2379,7 +2391,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          }
       ]

From d901f3c457ade0b4934427c0e8608dea31610720 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 14:52:19 +0100
Subject: [PATCH 13/47] hw/virtio-blk: add a constant for max number of merged
 requests

As it was not obvious (at least for me) where the 32 comes from;
add a constant for it.

Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/virtio-blk.c          | 2 +-
 include/hw/virtio/virtio-blk.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 4032fcae27..e04adb8f16 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -360,7 +360,7 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
                      BLOCK_ACCT_WRITE);
 
-    if (mrb->num_writes == 32) {
+    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
         virtio_submit_multiwrite(req->dev->blk, mrb);
     }
 
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 4652b70b5d..6ef3fa57cf 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,8 +134,10 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
+#define VIRTIO_BLK_MAX_MERGE_REQS 32
+
 typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[32];
+    BlockRequest        blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
     unsigned int        num_writes;
 } MultiReqBuffer;
 

From 454057b7d9b9ad141bd5df8c4075745e56b4870f Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 14:52:20 +0100
Subject: [PATCH 14/47] block-backend: expose bs->bl.max_transfer_length

Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c          | 5 +++++
 include/sysemu/block-backend.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index d00c129f15..c28e2402c7 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -580,6 +580,11 @@ int blk_get_flags(BlockBackend *blk)
     return bdrv_get_flags(blk->bs);
 }
 
+int blk_get_max_transfer_length(BlockBackend *blk)
+{
+    return blk->bs->bl.max_transfer_length;
+}
+
 void blk_set_guest_block_size(BlockBackend *blk, int align)
 {
     bdrv_set_guest_block_size(blk->bs, align);
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 8871a021d0..aab12b982c 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -127,6 +127,7 @@ int blk_is_inserted(BlockBackend *blk);
 void blk_lock_medium(BlockBackend *blk, bool locked);
 void blk_eject(BlockBackend *blk, bool eject_flag);
 int blk_get_flags(BlockBackend *blk);
+int blk_get_max_transfer_length(BlockBackend *blk);
 void blk_set_guest_block_size(BlockBackend *blk, int align);
 void *blk_blockalign(BlockBackend *blk, size_t size);
 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);

From 95f7142abc86a916682bd735aecd90172ffa0d30 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 14:52:21 +0100
Subject: [PATCH 15/47] virtio-blk: introduce multiread

this patch finally introduces multiread support to virtio-blk. While
multiwrite support was there for a long time, read support was missing.

The complete merge logic is moved into virtio-blk.c which has
been the only user of request merging ever since. This is required
to be able to merge chunks of requests and immediately invoke callbacks
for those requests. Secondly, this is required to switch to
direct invocation of coroutines which is planned at a later stage.

The following benchmarks show the performance of running fio with
4 worker threads on a local ram disk. The numbers show the average
of 10 test runs after 1 run as warmup phase.

              |        4k        |       64k        |        4k
MB/s          | rd seq | rd rand | rd seq | rd rand | wr seq | wr rand
--------------+--------+---------+--------+---------+--------+--------
master        | 1221   | 1187    | 4178   | 4114    | 1745   | 1213
multiread     | 1829   | 1189    | 4639   | 4110    | 1894   | 1216

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/dataplane/virtio-blk.c |   8 +-
 hw/block/virtio-blk.c           | 296 ++++++++++++++++++++++----------
 include/hw/virtio/virtio-blk.h  |  19 +-
 trace-events                    |   1 +
 4 files changed, 218 insertions(+), 106 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 39c5d7103c..be957d1117 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -96,9 +96,7 @@ static void handle_notify(EventNotifier *e)
     event_notifier_test_and_clear(&s->host_notifier);
     blk_io_plug(s->conf->conf.blk);
     for (;;) {
-        MultiReqBuffer mrb = {
-            .num_writes = 0,
-        };
+        MultiReqBuffer mrb = {};
         int ret;
 
         /* Disable guest->host notifies to avoid unnecessary vmexits */
@@ -120,7 +118,9 @@ static void handle_notify(EventNotifier *e)
             virtio_blk_handle_request(req, &mrb);
         }
 
-        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
+        if (mrb.num_reqs) {
+            virtio_blk_submit_multireq(s->conf->conf.blk, &mrb);
+        }
 
         if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e04adb8f16..d0a01a8864 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -34,6 +34,7 @@ VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
     req->dev = s;
     req->qiov.size = 0;
     req->next = NULL;
+    req->mr_next = NULL;
     return req;
 }
 
@@ -84,20 +85,32 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
 
 static void virtio_blk_rw_complete(void *opaque, int ret)
 {
-    VirtIOBlockReq *req = opaque;
+    VirtIOBlockReq *next = opaque;
 
-    trace_virtio_blk_rw_complete(req, ret);
+    while (next) {
+        VirtIOBlockReq *req = next;
+        next = req->mr_next;
+        trace_virtio_blk_rw_complete(req, ret);
 
-    if (ret) {
-        int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
-        bool is_read = !(p & VIRTIO_BLK_T_OUT);
-        if (virtio_blk_handle_rw_error(req, -ret, is_read))
-            return;
+        if (req->qiov.nalloc != -1) {
+            /* If nalloc is != 1 req->qiov is a local copy of the original
+             * external iovec. It was allocated in submit_merged_requests
+             * to be able to merge requests. */
+            qemu_iovec_destroy(&req->qiov);
+        }
+
+        if (ret) {
+            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
+            bool is_read = !(p & VIRTIO_BLK_T_OUT);
+            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
+                continue;
+            }
+        }
+
+        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+        virtio_blk_free_request(req);
     }
-
-    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
-    virtio_blk_free_request(req);
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -291,24 +304,127 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
     }
 }
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
+static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
+                                   int start, int num_reqs, int niov)
 {
-    int i, ret;
+    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
+    int64_t sector_num = mrb->reqs[start]->sector_num;
+    int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
+    bool is_write = mrb->is_write;
 
-    if (!mrb->num_writes) {
+    if (num_reqs > 1) {
+        int i;
+        struct iovec *tmp_iov = qiov->iov;
+        int tmp_niov = qiov->niov;
+
+        /* mrb->reqs[start]->qiov was initialized from external so we can't
+         * modifiy it here. We need to initialize it locally and then add the
+         * external iovecs. */
+        qemu_iovec_init(qiov, niov);
+
+        for (i = 0; i < tmp_niov; i++) {
+            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
+        }
+
+        for (i = start + 1; i < start + num_reqs; i++) {
+            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
+                              mrb->reqs[i]->qiov.size);
+            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
+            nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
+        }
+        assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);
+
+        trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
+                                         nb_sectors, is_write);
+        block_acct_merge_done(blk_get_stats(blk),
+                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
+                              num_reqs - 1);
+    }
+
+    if (is_write) {
+        blk_aio_writev(blk, sector_num, qiov, nb_sectors,
+                       virtio_blk_rw_complete, mrb->reqs[start]);
+    } else {
+        blk_aio_readv(blk, sector_num, qiov, nb_sectors,
+                      virtio_blk_rw_complete, mrb->reqs[start]);
+    }
+}
+
+static int multireq_compare(const void *a, const void *b)
+{
+    const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
+                         *req2 = *(VirtIOBlockReq **)b;
+
+    /*
+     * Note that we can't simply subtract sector_num1 from sector_num2
+     * here as that could overflow the return value.
+     */
+    if (req1->sector_num > req2->sector_num) {
+        return 1;
+    } else if (req1->sector_num < req2->sector_num) {
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
+{
+    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
+    int max_xfer_len = 0;
+    int64_t sector_num = 0;
+
+    if (mrb->num_reqs == 1) {
+        submit_requests(blk, mrb, 0, 1, -1);
+        mrb->num_reqs = 0;
         return;
     }
 
-    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
-    if (ret != 0) {
-        for (i = 0; i < mrb->num_writes; i++) {
-            if (mrb->blkreq[i].error) {
-                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
+    max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
+    max_xfer_len = MIN_NON_ZERO(max_xfer_len, INT_MAX);
+
+    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
+          &multireq_compare);
+
+    for (i = 0; i < mrb->num_reqs; i++) {
+        VirtIOBlockReq *req = mrb->reqs[i];
+        if (num_reqs > 0) {
+            bool merge = true;
+
+            /* merge would exceed maximum number of IOVs */
+            if (niov + req->qiov.niov > IOV_MAX) {
+                merge = false;
+            }
+
+            /* merge would exceed maximum transfer length of backend device */
+            if (req->qiov.size / BDRV_SECTOR_SIZE + nb_sectors > max_xfer_len) {
+                merge = false;
+            }
+
+            /* requests are not sequential */
+            if (sector_num + nb_sectors != req->sector_num) {
+                merge = false;
+            }
+
+            if (!merge) {
+                submit_requests(blk, mrb, start, num_reqs, niov);
+                num_reqs = 0;
             }
         }
+
+        if (num_reqs == 0) {
+            sector_num = req->sector_num;
+            nb_sectors = niov = 0;
+            start = i;
+        }
+
+        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
+        niov += req->qiov.niov;
+        num_reqs++;
     }
 
-    mrb->num_writes = 0;
+    submit_requests(blk, mrb, start, num_reqs, niov);
+    mrb->num_reqs = 0;
 }
 
 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
@@ -319,7 +435,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
-    virtio_submit_multiwrite(req->dev->blk, mrb);
+    if (mrb->is_write && mrb->num_reqs > 0) {
+        virtio_blk_submit_multireq(req->dev->blk, mrb);
+    }
     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
 }
 
@@ -329,6 +447,9 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
     uint64_t total_sectors;
 
+    if (nb_sectors > INT_MAX) {
+        return false;
+    }
     if (sector & dev->sector_mask) {
         return false;
     }
@@ -342,60 +463,6 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     return true;
 }
 
-static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
-{
-    BlockRequest *blkreq;
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_WRITE);
-
-    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
-        virtio_submit_multiwrite(req->dev->blk, mrb);
-    }
-
-    blkreq = &mrb->blkreq[mrb->num_writes];
-    blkreq->sector = sector;
-    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
-    blkreq->qiov = &req->qiov;
-    blkreq->cb = virtio_blk_rw_complete;
-    blkreq->opaque = req;
-    blkreq->error = 0;
-
-    mrb->num_writes++;
-}
-
-static void virtio_blk_handle_read(VirtIOBlockReq *req)
-{
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_READ);
-    blk_aio_readv(req->dev->blk, sector, &req->qiov,
-                  req->qiov.size / BDRV_SECTOR_SIZE,
-                  virtio_blk_rw_complete, req);
-}
-
 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -430,11 +497,57 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 
     type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
 
-    if (type & VIRTIO_BLK_T_FLUSH) {
+    /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
+     * is an optional flag. Altough a guest should not send this flag if
+     * not negotiated we ignored it in the past. So keep ignoring it. */
+    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
+    case VIRTIO_BLK_T_IN:
+    {
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
+                                       &req->out.sector);
+
+        if (is_write) {
+            qemu_iovec_init_external(&req->qiov, iov, out_num);
+            trace_virtio_blk_handle_write(req, req->sector_num,
+                                          req->qiov.size / BDRV_SECTOR_SIZE);
+        } else {
+            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
+            trace_virtio_blk_handle_read(req, req->sector_num,
+                                         req->qiov.size / BDRV_SECTOR_SIZE);
+        }
+
+        if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
+                                      req->qiov.size)) {
+            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+            virtio_blk_free_request(req);
+            return;
+        }
+
+        block_acct_start(blk_get_stats(req->dev->blk),
+                         &req->acct, req->qiov.size,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        /* merge would exceed maximum number of requests or IO direction
+         * changes */
+        if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
+                                  is_write != mrb->is_write)) {
+            virtio_blk_submit_multireq(req->dev->blk, mrb);
+        }
+
+        assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
+        mrb->reqs[mrb->num_reqs++] = req;
+        mrb->is_write = is_write;
+        break;
+    }
+    case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
-    } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
+        break;
+    case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
-    } else if (type & VIRTIO_BLK_T_GET_ID) {
+        break;
+    case VIRTIO_BLK_T_GET_ID:
+    {
         VirtIOBlock *s = req->dev;
 
         /*
@@ -448,14 +561,9 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         iov_from_buf(in_iov, in_num, 0, serial, size);
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
         virtio_blk_free_request(req);
-    } else if (type & VIRTIO_BLK_T_OUT) {
-        qemu_iovec_init_external(&req->qiov, iov, out_num);
-        virtio_blk_handle_write(req, mrb);
-    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
-        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
-        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
-        virtio_blk_handle_read(req);
-    } else {
+        break;
+    }
+    default:
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
         virtio_blk_free_request(req);
     }
@@ -465,9 +573,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     VirtIOBlockReq *req;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb = {};
 
     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
      * dataplane here instead of waiting for .set_status().
@@ -481,7 +587,9 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
         virtio_blk_handle_request(req, &mrb);
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    if (mrb.num_reqs) {
+        virtio_blk_submit_multireq(s->blk, &mrb);
+    }
 
     /*
      * FIXME: Want to check for completions before returning to guest mode,
@@ -494,9 +602,7 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 {
     VirtIOBlock *s = opaque;
     VirtIOBlockReq *req = s->rq;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb = {};
 
     qemu_bh_delete(s->bh);
     s->bh = NULL;
@@ -509,7 +615,9 @@ static void virtio_blk_dma_restart_bh(void *opaque)
         req = next;
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    if (mrb.num_reqs) {
+        virtio_blk_submit_multireq(s->blk, &mrb);
+    }
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 6ef3fa57cf..2027a64093 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,29 +134,32 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
-#define VIRTIO_BLK_MAX_MERGE_REQS 32
-
-typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
-    unsigned int        num_writes;
-} MultiReqBuffer;
-
 typedef struct VirtIOBlockReq {
+    int64_t sector_num;
     VirtIOBlock *dev;
     VirtQueueElement elem;
     struct virtio_blk_inhdr *in;
     struct virtio_blk_outhdr out;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    struct VirtIOBlockReq *mr_next;
     BlockAcctCookie acct;
 } VirtIOBlockReq;
 
+#define VIRTIO_BLK_MAX_MERGE_REQS 32
+
+typedef struct MultiReqBuffer {
+    VirtIOBlockReq *reqs[VIRTIO_BLK_MAX_MERGE_REQS];
+    unsigned int num_reqs;
+    bool is_write;
+} MultiReqBuffer;
+
 VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
 
 void virtio_blk_free_request(VirtIOBlockReq *req);
 
 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb);
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb);
+void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb);
 
 #endif
diff --git a/trace-events b/trace-events
index 907da7ed8a..57f357f7f7 100644
--- a/trace-events
+++ b/trace-events
@@ -116,6 +116,7 @@ virtio_blk_req_complete(void *req, int status) "req %p status %d"
 virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
 virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
 virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
+virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t sector, size_t nsectors, bool is_write) "mrb %p start %d num_reqs %d sector %"PRIu64" nsectors %zu is_write %d"
 
 # hw/block/dataplane/virtio-blk.c
 virtio_blk_data_plane_start(void *s) "dataplane %p"

From c99495ac1b4a27cb57bf04ed1a169177aeea2649 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 2 Feb 2015 14:52:22 +0100
Subject: [PATCH 16/47] virtio-blk: add a knob to disable request merging

this adds a knob to disable request merging for debugging or benchmarks if dedired.

Signed-off-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/virtio-blk.c          | 5 ++++-
 include/hw/virtio/virtio-blk.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index d0a01a8864..8c51a295d9 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -531,7 +531,8 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         /* merge would exceed maximum number of requests or IO direction
          * changes */
         if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
-                                  is_write != mrb->is_write)) {
+                                  is_write != mrb->is_write ||
+                                  !req->dev->conf.request_merging)) {
             virtio_blk_submit_multireq(req->dev->blk, mrb);
         }
 
@@ -950,6 +951,8 @@ static Property virtio_blk_properties[] = {
 #ifdef __linux__
     DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, true),
 #endif
+    DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
+                    true),
     DEFINE_PROP_BIT("x-data-plane", VirtIOBlock, conf.data_plane, 0, false),
     DEFINE_PROP_END_OF_LIST(),
 };
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 2027a64093..fc7d311a43 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -113,6 +113,7 @@ struct VirtIOBlkConf
     uint32_t scsi;
     uint32_t config_wce;
     uint32_t data_plane;
+    uint32_t request_merging;
 };
 
 struct VirtIOBlockDataPlane;

From 79e7a01954d5b171186ef1e946e9159417befc1e Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 16 Jan 2015 09:38:42 +0800
Subject: [PATCH 17/47] qemu-iotests: Fix supported_oses check

There is a bug in the recently added sys.platform test, and we no longer
run python tests, because "linux2" is the value to compare here. So do a
prefix match. According to python doc [1], the way to use sys.platform
is "unless you want to test for a specific system version, it is
therefore recommended to use the following idiom":

if sys.platform.startswith('freebsd'):
    # FreeBSD-specific code here...
elif sys.platform.startswith('linux'):
    # Linux-specific code here...

[1]: https://docs.python.org/2.7/library/sys.html#sys.platform

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/iotests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 87002e0e2c..241b5ee9dd 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -288,7 +288,7 @@ def main(supported_fmts=[], supported_oses=['linux']):
     if supported_fmts and (imgfmt not in supported_fmts):
         notrun('not suitable for this image format: %s' % imgfmt)
 
-    if sys.platform not in supported_oses:
+    if True not in [sys.platform.startswith(x) for x in supported_oses]:
         notrun('not suitable for this OS: %s' % sys.platform)
 
     # We need to filter out the time taken from the output so that qemu-iotest

From 6440d44cea84451ee9facb1237a4e7251631df80 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 23 Jan 2015 14:28:34 -0500
Subject: [PATCH 18/47] iotests: Specify format for qemu-nbd

This patch is necessary to suppress the "probed raw" warning when
running raw over nbd tests.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/common.rc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index aa093d9d84..22d3514041 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -153,7 +153,7 @@ _make_test_img()
 
     # Start an NBD server on the image file, which is what we'll be talking to
     if [ $IMGPROTO = "nbd" ]; then
-        eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810  $TEST_IMG_FILE &"
+        eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT  $TEST_IMG_FILE &"
         QEMU_NBD_PID=$!
         sleep 1 # FIXME: qemu-nbd needs to be listening before we continue
     fi

From e2462113b2003085ad16f15e1442ded64e2d9a29 Mon Sep 17 00:00:00 2001
From: Francesco Romani <fromani@redhat.com>
Date: Mon, 12 Jan 2015 14:11:13 +0100
Subject: [PATCH 19/47] block: add event when disk usage exceeds threshold

Managing applications, like oVirt (http://www.ovirt.org), make extensive
use of thin-provisioned disk images.
To let the guest run smoothly and be not unnecessarily paused, oVirt sets
a disk usage threshold (so called 'high water mark') based on the occupation
of the device,  and automatically extends the image once the threshold
is reached or exceeded.

In order to detect the crossing of the threshold, oVirt has no choice but
aggressively polling the QEMU monitor using the query-blockstats command.
This lead to unnecessary system load, and is made even worse under scale:
deployments with hundreds of VMs are no longer rare.

To fix this, this patch adds:
* A new monitor command `block-set-write-threshold', to set a mark for
  a given block device.
* A new event `BLOCK_WRITE_THRESHOLD', to report if a block device
  usage exceeds the threshold.
* A new `write_threshold' field into the `BlockDeviceInfo' structure,
  to report the configured threshold.

This will allow the managing application to use smarter and more
efficient monitoring, greatly reducing the need of polling.

[Updated qemu-iotests 067 output to add the new 'write_threshold'
property. --Stefan]
[Changed g_assert_false() to !g_assert() to fix the build on older glib
versions. --Kevin]

Signed-off-by: Francesco Romani <fromani@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1421068273-692-1-git-send-email-fromani@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/Makefile.objs             |   1 +
 block/qapi.c                    |   3 +
 block/write-threshold.c         | 125 ++++++++++++++++++++++++++++++++
 include/block/block_int.h       |   4 +
 include/block/write-threshold.h |  64 ++++++++++++++++
 qapi/block-core.json            |  51 ++++++++++++-
 qmp-commands.hx                 |  32 ++++++++
 tests/Makefile                  |   3 +
 tests/qemu-iotests/067.out      |   5 ++
 tests/test-write-threshold.c    | 119 ++++++++++++++++++++++++++++++
 10 files changed, 406 insertions(+), 1 deletion(-)
 create mode 100644 block/write-threshold.c
 create mode 100644 include/block/write-threshold.h
 create mode 100644 tests/test-write-threshold.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 04b0e43eb1..010afad71a 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,6 +20,7 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
+block-obj-y += write-threshold.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
diff --git a/block/qapi.c b/block/qapi.c
index d1a891796c..1808e67336 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -24,6 +24,7 @@
 
 #include "block/qapi.h"
 #include "block/block_int.h"
+#include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
 #include "qapi/qmp-output-visitor.h"
@@ -89,6 +90,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
         info->iops_size = cfg.op_size;
     }
 
+    info->write_threshold = bdrv_write_threshold_get(bs);
+
     return info;
 }
 
diff --git a/block/write-threshold.c b/block/write-threshold.c
new file mode 100644
index 0000000000..c2cd517716
--- /dev/null
+++ b/block/write-threshold.c
@@ -0,0 +1,125 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "block/block_int.h"
+#include "block/coroutine.h"
+#include "block/write-threshold.h"
+#include "qemu/notify.h"
+#include "qapi-event.h"
+#include "qmp-commands.h"
+
+
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs)
+{
+    return bs->write_threshold_offset;
+}
+
+bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
+{
+    return bs->write_threshold_offset > 0;
+}
+
+static void write_threshold_disable(BlockDriverState *bs)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        notifier_with_return_remove(&bs->write_threshold_notifier);
+        bs->write_threshold_offset = 0;
+    }
+}
+
+uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
+                                       const BdrvTrackedRequest *req)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        if (req->offset > bs->write_threshold_offset) {
+            return (req->offset - bs->write_threshold_offset) + req->bytes;
+        }
+        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
+            return (req->offset + req->bytes) - bs->write_threshold_offset;
+        }
+    }
+    return 0;
+}
+
+static int coroutine_fn before_write_notify(NotifierWithReturn *notifier,
+                                            void *opaque)
+{
+    BdrvTrackedRequest *req = opaque;
+    BlockDriverState *bs = req->bs;
+    uint64_t amount = 0;
+
+    amount = bdrv_write_threshold_exceeded(bs, req);
+    if (amount > 0) {
+        qapi_event_send_block_write_threshold(
+            bs->node_name,
+            amount,
+            bs->write_threshold_offset,
+            &error_abort);
+
+        /* autodisable to avoid flooding the monitor */
+        write_threshold_disable(bs);
+    }
+
+    return 0; /* should always let other notifiers run */
+}
+
+static void write_threshold_register_notifier(BlockDriverState *bs)
+{
+    bs->write_threshold_notifier.notify = before_write_notify;
+    notifier_with_return_list_add(&bs->before_write_notifiers,
+                                  &bs->write_threshold_notifier);
+}
+
+static void write_threshold_update(BlockDriverState *bs,
+                                   int64_t threshold_bytes)
+{
+    bs->write_threshold_offset = threshold_bytes;
+}
+
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        if (threshold_bytes > 0) {
+            write_threshold_update(bs, threshold_bytes);
+        } else {
+            write_threshold_disable(bs);
+        }
+    } else {
+        if (threshold_bytes > 0) {
+            /* avoid multiple registration */
+            write_threshold_register_notifier(bs);
+            write_threshold_update(bs, threshold_bytes);
+        }
+        /* discard bogus disable request */
+    }
+}
+
+void qmp_block_set_write_threshold(const char *node_name,
+                                   uint64_t threshold_bytes,
+                                   Error **errp)
+{
+    BlockDriverState *bs;
+    AioContext *aio_context;
+
+    bs = bdrv_find_node(node_name);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
+        return;
+    }
+
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
+
+    bdrv_write_threshold_set(bs, threshold_bytes);
+
+    aio_context_release(aio_context);
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e264be97b2..7ad19503df 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -412,6 +412,10 @@ struct BlockDriverState {
 
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
+
+    /* threshold limit for writes, in bytes. "High water mark". */
+    uint64_t write_threshold_offset;
+    NotifierWithReturn write_threshold_notifier;
 };
 
 
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
new file mode 100644
index 0000000000..f1b899cd5f
--- /dev/null
+++ b/include/block/write-threshold.h
@@ -0,0 +1,64 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#ifndef BLOCK_WRITE_THRESHOLD_H
+#define BLOCK_WRITE_THRESHOLD_H
+
+#include <stdint.h>
+
+#include "qemu/typedefs.h"
+#include "qemu-common.h"
+
+/*
+ * bdrv_write_threshold_set:
+ *
+ * Set the write threshold for block devices, in bytes.
+ * Notify when a write exceeds the threshold, meaning the device
+ * is becoming full, so it can be transparently resized.
+ * To be used with thin-provisioned block devices.
+ *
+ * Use threshold_bytes == 0 to disable.
+ */
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
+
+/*
+ * bdrv_write_threshold_get
+ *
+ * Get the configured write threshold, in bytes.
+ * Zero means no threshold configured.
+ */
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_is_set
+ *
+ * Tell if a write threshold is set for a given BDS.
+ */
+bool bdrv_write_threshold_is_set(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_exceeded
+ *
+ * Return the extent of a write request that exceeded the threshold,
+ * or zero if the request is below the threshold.
+ * Return zero also if the threshold was not set.
+ *
+ * NOTE: here we assume the following holds for each request this code
+ * deals with:
+ *
+ * assert((req->offset + req->bytes) <= UINT64_MAX)
+ *
+ * Please not there is *not* an actual C assert().
+ */
+uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
+                                       const BdrvTrackedRequest *req);
+
+#endif
diff --git a/qapi/block-core.json b/qapi/block-core.json
index b7d9772444..a3fdaf02ba 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -257,6 +257,9 @@
 #
 # @cache: the cache mode used for the block device (since: 2.3)
 #
+# @write_threshold: configured write threshold for the device.
+#                   0 if disabled. (Since 2.3)
+#
 # Since: 0.14.0
 #
 ##
@@ -271,7 +274,8 @@
             '*bps_max': 'int', '*bps_rd_max': 'int',
             '*bps_wr_max': 'int', '*iops_max': 'int',
             '*iops_rd_max': 'int', '*iops_wr_max': 'int',
-            '*iops_size': 'int', 'cache': 'BlockdevCacheInfo' } }
+            '*iops_size': 'int', 'cache': 'BlockdevCacheInfo',
+            'write_threshold': 'int' } }
 
 ##
 # @BlockDeviceIoStatus:
@@ -1917,3 +1921,48 @@
 ##
 { 'enum': 'PreallocMode',
   'data': [ 'off', 'metadata', 'falloc', 'full' ] }
+
+##
+# @BLOCK_WRITE_THRESHOLD
+#
+# Emitted when writes on block device reaches or exceeds the
+# configured write threshold. For thin-provisioned devices, this
+# means the device should be extended to avoid pausing for
+# disk exhaustion.
+# The event is one shot. Once triggered, it needs to be
+# re-registered with another block-set-threshold command.
+#
+# @node-name: graph node name on which the threshold was exceeded.
+#
+# @amount-exceeded: amount of data which exceeded the threshold, in bytes.
+#
+# @write-threshold: last configured threshold, in bytes.
+#
+# Since: 2.3
+##
+{ 'event': 'BLOCK_WRITE_THRESHOLD',
+  'data': { 'node-name': 'str',
+            'amount-exceeded': 'uint64',
+            'write-threshold': 'uint64' } }
+
+##
+# @block-set-write-threshold
+#
+# Change the write threshold for a block drive. An event will be delivered
+# if a write to this block drive crosses the configured threshold.
+# This is useful to transparently resize thin-provisioned drives without
+# the guest OS noticing.
+#
+# @node-name: graph node name on which the threshold must be set.
+#
+# @write-threshold: configured threshold for the block device, bytes.
+#                   Use 0 to disable the threshold.
+#
+# Returns: Nothing on success
+#          If @node name is not found on the block device graph,
+#          DeviceNotFound
+#
+# Since: 2.3
+##
+{ 'command': 'block-set-write-threshold',
+  'data': { 'node-name': 'str', 'write-threshold': 'uint64' } }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index af3fd19935..a85d8479e3 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2146,6 +2146,8 @@ Each json-object contain the following:
          - "iops_size": I/O size when limiting by iops (json-int)
          - "detect_zeroes": detect and optimize zero writing (json-string)
              - Possible values: "off", "on", "unmap"
+         - "write_threshold": write offset threshold in bytes, a event will be
+                              emitted if crossed. Zero if disabled (json-int)
          - "image": the detail of the image, it is a json-object containing
             the following:
              - "filename": image file name (json-string)
@@ -2223,6 +2225,7 @@ Example:
                "iops_wr_max": 0,
                "iops_size": 0,
                "detect_zeroes": "on",
+               "write_threshold": 0,
                "image":{
                   "filename":"disks/test.qcow2",
                   "format":"qcow2",
@@ -3685,6 +3688,7 @@ Example:
                    "iops_rd_max": 0,
                    "iops_wr_max": 0,
                    "iops_size": 0,
+                   "write_threshold": 0,
                    "image":{
                       "filename":"disks/test.qcow2",
                       "format":"qcow2",
@@ -3920,4 +3924,32 @@ Move mouse pointer to absolute coordinates (20000, 400).
                { "type": "abs", "data" : { "axis": "Y", "value" : 400 } } ] } }
 <- { "return": {} }
 
+EQMP
+
+    {
+        .name       = "block-set-write-threshold",
+        .args_type  = "node-name:s,write-threshold:l",
+        .mhandler.cmd_new = qmp_marshal_input_block_set_write_threshold,
+    },
+
+SQMP
+block-set-write-threshold
+------------
+
+Change the write threshold for a block drive. The threshold is an offset,
+thus must be non-negative. Default is no write threshold.
+Setting the threshold to zero disables it.
+
+Arguments:
+
+- "node-name": the node name in the block driver state graph (json-string)
+- "write-threshold": the write threshold in bytes (json-int)
+
+Example:
+
+-> { "execute": "block-set-write-threshold",
+  "arguments": { "node-name": "mydev",
+                 "write-threshold": 17179869184 } }
+<- { "return": {} }
+
 EQMP
diff --git a/tests/Makefile b/tests/Makefile
index 5caccf765a..d5df16882d 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -68,6 +68,8 @@ check-unit-y += tests/check-qom-interface$(EXESUF)
 gcov-files-check-qom-interface-y = qom/object.c
 check-unit-y += tests/test-qemu-opts$(EXESUF)
 gcov-files-test-qemu-opts-y = qom/test-qemu-opts.c
+check-unit-y += tests/test-write-threshold$(EXESUF)
+gcov-files-test-write-threshold-y = block/write-threshold.c
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -360,6 +362,7 @@ tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
 tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y)
 tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
 tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a
+tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(block-obj-y) libqemuutil.a libqemustub.a
 
 ifeq ($(CONFIG_POSIX),y)
 LIBS += -lutil
diff --git a/tests/qemu-iotests/067.out b/tests/qemu-iotests/067.out
index 13ff3cd7aa..00b3eaefc7 100644
--- a/tests/qemu-iotests/067.out
+++ b/tests/qemu-iotests/067.out
@@ -43,6 +43,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk -device virti
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -218,6 +219,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -423,6 +425,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -607,6 +610,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -717,6 +721,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
diff --git a/tests/test-write-threshold.c b/tests/test-write-threshold.c
new file mode 100644
index 0000000000..faffa7b855
--- /dev/null
+++ b/tests/test-write-threshold.c
@@ -0,0 +1,119 @@
+/*
+ * Test block device write threshold
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include <glib.h>
+#include <stdint.h>
+#include "block/block_int.h"
+#include "block/write-threshold.h"
+
+
+static void test_threshold_not_set_on_init(void)
+{
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    g_assert(!bdrv_write_threshold_is_set(&bs));
+
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, 0);
+}
+
+static void test_threshold_set_get(void)
+{
+    uint64_t threshold = 4 * 1024 * 1024;
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    bdrv_write_threshold_set(&bs, threshold);
+
+    g_assert(bdrv_write_threshold_is_set(&bs));
+
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, threshold);
+}
+
+static void test_threshold_multi_set_get(void)
+{
+    uint64_t threshold1 = 4 * 1024 * 1024;
+    uint64_t threshold2 = 15 * 1024 * 1024;
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    bdrv_write_threshold_set(&bs, threshold1);
+    bdrv_write_threshold_set(&bs, threshold2);
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, threshold2);
+}
+
+static void test_threshold_not_trigger(void)
+{
+    uint64_t amount = 0;
+    uint64_t threshold = 4 * 1024 * 1024;
+    BlockDriverState bs;
+    BdrvTrackedRequest req;
+
+    memset(&bs, 0, sizeof(bs));
+    memset(&req, 0, sizeof(req));
+    req.offset = 1024;
+    req.bytes = 1024;
+
+    bdrv_write_threshold_set(&bs, threshold);
+    amount = bdrv_write_threshold_exceeded(&bs, &req);
+    g_assert_cmpuint(amount, ==, 0);
+}
+
+
+static void test_threshold_trigger(void)
+{
+    uint64_t amount = 0;
+    uint64_t threshold = 4 * 1024 * 1024;
+    BlockDriverState bs;
+    BdrvTrackedRequest req;
+
+    memset(&bs, 0, sizeof(bs));
+    memset(&req, 0, sizeof(req));
+    req.offset = (4 * 1024 * 1024) - 1024;
+    req.bytes = 2 * 1024;
+
+    bdrv_write_threshold_set(&bs, threshold);
+    amount = bdrv_write_threshold_exceeded(&bs, &req);
+    g_assert_cmpuint(amount, >=, 1024);
+}
+
+typedef struct TestStruct {
+    const char *name;
+    void (*func)(void);
+} TestStruct;
+
+
+int main(int argc, char **argv)
+{
+    size_t i;
+    TestStruct tests[] = {
+        { "/write-threshold/not-set-on-init",
+          test_threshold_not_set_on_init },
+        { "/write-threshold/set-get",
+          test_threshold_set_get },
+        { "/write-threshold/multi-set-get",
+          test_threshold_multi_set_get },
+        { "/write-threshold/not-trigger",
+          test_threshold_not_trigger },
+        { "/write-threshold/trigger",
+          test_threshold_trigger },
+        { NULL, NULL }
+    };
+
+    g_test_init(&argc, &argv, NULL);
+    for (i = 0; tests[i].name != NULL; i++) {
+        g_test_add_func(tests[i].name, tests[i].func);
+    }
+    return g_test_run();
+}

From fa8354bd226617e2afcc45c27a74959c15291cc0 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:04 +0100
Subject: [PATCH 20/47] block/dmg: properly detect the UDIF trailer

DMG files have a variable length with a UDIF trailer at the end of a
file. This UDIF trailer is essential as it describes the contents of
the image. At the moment however, the start of this trailer is almost
always incorrect as bdrv_getlength() returns a multiple of the block
size (rounded up). This results in a failure to recognize DMG files,
resulting in Invalid argument (EINVAL) errors.

As there is no API to retrieve the real file size, look for the magic
header in the last two sectors to find the start of this 512-byte UDIF
trailer (the "koly" block).

The resource fork offset ("info_begin") has its offset adjusted as the
initial value of offset does not mean "end of file" anymore, but "begin
of UDIF trailer".

[Replaced error_set(errp, ERROR_CLASS_GENERIC_ERROR, ...) with
error_setg(errp, ...) as discussed with Peter.
--Stefan]

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1420566495-13284-2-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index e455886d2b..cdad28fe14 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -131,6 +131,46 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
     }
 }
 
+static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp)
+{
+    int64_t length;
+    int64_t offset = 0;
+    uint8_t buffer[515];
+    int i, ret;
+
+    /* bdrv_getlength returns a multiple of block size (512), rounded up. Since
+     * dmg images can have odd sizes, try to look for the "koly" magic which
+     * marks the begin of the UDIF trailer (512 bytes). This magic can be found
+     * in the last 511 bytes of the second-last sector or the first 4 bytes of
+     * the last sector (search space: 515 bytes) */
+    length = bdrv_getlength(file_bs);
+    if (length < 0) {
+        error_setg_errno(errp, -length,
+            "Failed to get file size while reading UDIF trailer");
+        return length;
+    } else if (length < 512) {
+        error_setg(errp, "dmg file must be at least 512 bytes long");
+        return -EINVAL;
+    }
+    if (length > 511 + 512) {
+        offset = length - 511 - 512;
+    }
+    length = length < 515 ? length : 515;
+    ret = bdrv_pread(file_bs, offset, buffer, length);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed while reading UDIF trailer");
+        return ret;
+    }
+    for (i = 0; i < length - 3; i++) {
+        if (buffer[i] == 'k' && buffer[i+1] == 'o' &&
+            buffer[i+2] == 'l' && buffer[i+3] == 'y') {
+            return offset + i;
+        }
+    }
+    error_setg(errp, "Could not locate UDIF trailer in dmg file");
+    return -EINVAL;
+}
+
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
@@ -145,15 +185,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
     s->n_chunks = 0;
     s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
 
-    /* read offset of info blocks */
-    offset = bdrv_getlength(bs->file);
+    /* locate the UDIF trailer */
+    offset = dmg_find_koly_offset(bs->file, errp);
     if (offset < 0) {
         ret = offset;
         goto fail;
     }
-    offset -= 0x1d8;
 
-    ret = read_uint64(bs, offset, &info_begin);
+    ret = read_uint64(bs, offset + 0x28, &info_begin);
     if (ret < 0) {
         goto fail;
     } else if (info_begin == 0) {

From 65a1c7c96a64cd69269c9ba854fa347dd0bbda0b Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:05 +0100
Subject: [PATCH 21/47] block/dmg: extract mish block decoding functionality

Extract the mish block decoder such that this can be used for other
formats in the future. A new DmgHeaderState struct is introduced to
share state while decoding.

The code is kept unchanged as much as possible, a "fail" label is added
for example where a simple return would probably do. In dmg_open, the
variable "tmp" is renamed to "rsrc_data_offset" for clarity and comments
have been added explaining various data.

Note that this patch has one subtle difference with the previous
version which should not affect functionality. In the previous code,
the end of a resource was inferred from the mish block (the offsets
would be increased by the fields). In this patch, the resource length
is used instead to avoid the need to rely on the previous offsets.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1420566495-13284-3-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 228 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 133 insertions(+), 95 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index cdad28fe14..c571ac9121 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -171,19 +171,138 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp)
     return -EINVAL;
 }
 
+/* used when building the sector table */
+typedef struct DmgHeaderState {
+    /* used internally by dmg_read_mish_block to remember offsets of blocks
+     * across calls */
+    uint64_t last_in_offset;
+    uint64_t last_out_offset;
+    /* exported for dmg_open */
+    uint32_t max_compressed_size;
+    uint32_t max_sectors_per_chunk;
+} DmgHeaderState;
+
+static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds,
+                               int64_t offset, uint32_t count)
+{
+    BDRVDMGState *s = bs->opaque;
+    uint32_t type, i;
+    int ret;
+    size_t new_size;
+    uint32_t chunk_count;
+
+    ret = read_uint32(bs, offset, &type);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* skip data that is not a valid MISH block (invalid magic or too small) */
+    if (type != 0x6d697368 || count < 244) {
+        /* assume success for now */
+        return 0;
+    }
+
+    offset += 4;
+    offset += 200;
+
+    chunk_count = (count - 204) / 40;
+    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+    s->types = g_realloc(s->types, new_size / 2);
+    s->offsets = g_realloc(s->offsets, new_size);
+    s->lengths = g_realloc(s->lengths, new_size);
+    s->sectors = g_realloc(s->sectors, new_size);
+    s->sectorcounts = g_realloc(s->sectorcounts, new_size);
+
+    for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
+        ret = read_uint32(bs, offset, &s->types[i]);
+        if (ret < 0) {
+            goto fail;
+        }
+        offset += 4;
+        if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
+            s->types[i] != 2) {
+            if (s->types[i] == 0xffffffff && i > 0) {
+                ds->last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
+                ds->last_out_offset = s->sectors[i - 1] +
+                                      s->sectorcounts[i - 1];
+            }
+            chunk_count--;
+            i--;
+            offset += 36;
+            continue;
+        }
+        offset += 4;
+
+        ret = read_uint64(bs, offset, &s->sectors[i]);
+        if (ret < 0) {
+            goto fail;
+        }
+        s->sectors[i] += ds->last_out_offset;
+        offset += 8;
+
+        ret = read_uint64(bs, offset, &s->sectorcounts[i]);
+        if (ret < 0) {
+            goto fail;
+        }
+        offset += 8;
+
+        if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
+            error_report("sector count %" PRIu64 " for chunk %" PRIu32
+                         " is larger than max (%u)",
+                         s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        ret = read_uint64(bs, offset, &s->offsets[i]);
+        if (ret < 0) {
+            goto fail;
+        }
+        s->offsets[i] += ds->last_in_offset;
+        offset += 8;
+
+        ret = read_uint64(bs, offset, &s->lengths[i]);
+        if (ret < 0) {
+            goto fail;
+        }
+        offset += 8;
+
+        if (s->lengths[i] > DMG_LENGTHS_MAX) {
+            error_report("length %" PRIu64 " for chunk %" PRIu32
+                         " is larger than max (%u)",
+                         s->lengths[i], i, DMG_LENGTHS_MAX);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        update_max_chunk_size(s, i, &ds->max_compressed_size,
+                              &ds->max_sectors_per_chunk);
+    }
+    s->n_chunks += chunk_count;
+    return 0;
+
+fail:
+    return ret;
+}
+
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     BDRVDMGState *s = bs->opaque;
-    uint64_t info_begin, info_end, last_in_offset, last_out_offset;
-    uint32_t count, tmp;
-    uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i;
+    DmgHeaderState ds;
+    uint64_t info_begin, info_end;
+    uint32_t count, rsrc_data_offset;
     int64_t offset;
     int ret;
 
     bs->read_only = 1;
     s->n_chunks = 0;
     s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+    /* used by dmg_read_mish_block to keep track of the current I/O position */
+    ds.last_in_offset = 0;
+    ds.last_out_offset = 0;
+    ds.max_compressed_size = 1;
+    ds.max_sectors_per_chunk = 1;
 
     /* locate the UDIF trailer */
     offset = dmg_find_koly_offset(bs->file, errp);
@@ -200,10 +319,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = read_uint32(bs, info_begin, &tmp);
+    ret = read_uint32(bs, info_begin, &rsrc_data_offset);
     if (ret < 0) {
         goto fail;
-    } else if (tmp != 0x100) {
+    } else if (rsrc_data_offset != 0x100) {
         ret = -EINVAL;
         goto fail;
     }
@@ -215,15 +334,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         ret = -EINVAL;
         goto fail;
     }
+    /* end of resource data, ignoring the following resource map */
     info_end = info_begin + count;
 
+    /* begin of resource data (consisting of one or more resources) */
     offset = info_begin + 0x100;
 
-    /* read offsets */
-    last_in_offset = last_out_offset = 0;
+    /* read offsets (mish blocks) from one or more resources in resource data */
     while (offset < info_end) {
-        uint32_t type;
-
+        /* size of following resource */
         ret = read_uint32(bs, offset, &count);
         if (ret < 0) {
             goto fail;
@@ -233,100 +352,19 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         }
         offset += 4;
 
-        ret = read_uint32(bs, offset, &type);
+        ret = dmg_read_mish_block(bs, &ds, offset, count);
         if (ret < 0) {
             goto fail;
         }
-
-        if (type == 0x6d697368 && count >= 244) {
-            size_t new_size;
-            uint32_t chunk_count;
-
-            offset += 4;
-            offset += 200;
-
-            chunk_count = (count - 204) / 40;
-            new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
-            s->types = g_realloc(s->types, new_size / 2);
-            s->offsets = g_realloc(s->offsets, new_size);
-            s->lengths = g_realloc(s->lengths, new_size);
-            s->sectors = g_realloc(s->sectors, new_size);
-            s->sectorcounts = g_realloc(s->sectorcounts, new_size);
-
-            for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
-                ret = read_uint32(bs, offset, &s->types[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 4;
-                if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
-                    s->types[i] != 2) {
-                    if (s->types[i] == 0xffffffff && i > 0) {
-                        last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
-                        last_out_offset = s->sectors[i - 1] +
-                                          s->sectorcounts[i - 1];
-                    }
-                    chunk_count--;
-                    i--;
-                    offset += 36;
-                    continue;
-                }
-                offset += 4;
-
-                ret = read_uint64(bs, offset, &s->sectors[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                s->sectors[i] += last_out_offset;
-                offset += 8;
-
-                ret = read_uint64(bs, offset, &s->sectorcounts[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 8;
-
-                if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
-                    error_report("sector count %" PRIu64 " for chunk %" PRIu32
-                                 " is larger than max (%u)",
-                                 s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
-                ret = read_uint64(bs, offset, &s->offsets[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                s->offsets[i] += last_in_offset;
-                offset += 8;
-
-                ret = read_uint64(bs, offset, &s->lengths[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 8;
-
-                if (s->lengths[i] > DMG_LENGTHS_MAX) {
-                    error_report("length %" PRIu64 " for chunk %" PRIu32
-                                 " is larger than max (%u)",
-                                 s->lengths[i], i, DMG_LENGTHS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
-                update_max_chunk_size(s, i, &max_compressed_size,
-                                      &max_sectors_per_chunk);
-            }
-            s->n_chunks += chunk_count;
-        }
+        /* advance offset by size of resource */
+        offset += count;
     }
 
     /* initialize zlib engine */
     s->compressed_chunk = qemu_try_blockalign(bs->file,
-                                              max_compressed_size + 1);
+                                              ds.max_compressed_size + 1);
     s->uncompressed_chunk = qemu_try_blockalign(bs->file,
-                                                512 * max_sectors_per_chunk);
+                                                512 * ds.max_sectors_per_chunk);
     if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) {
         ret = -ENOMEM;
         goto fail;

From b0e8dc5d54225d2e7012bd306e62ac90ba6fc1ea Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:06 +0100
Subject: [PATCH 22/47] block/dmg: extract processing of resource forks

Besides the offset, also read the resource length. This length is now
used in the extracted function to verify the end of the resource fork
against "count" from the resource fork.

Instead of relying on the value of offset to conclude whether the
resource fork is available or not (info_begin==0), check the
rsrc_fork_length instead. This would allow a dmg file to begin with a
resource fork. This seemingly unnecessary restriction was found while
trying to craft a DMG file by hand.

Other changes:

 - Do not require resource data offset to be 0x100 (but check that it
   is within bounds though).
 - Further improve boundary checking (resource data must be within
   the resource fork).
 - Use correct value for resource data length (spotted by John Snow)
 - Consider the resource data offset when determining info_end.
   This fixes an EINVAL on the tuxpaint dmg example.

The resource fork format is documented at
https://developer.apple.com/legacy/library/documentation/mac/pdf/MoreMacintoshToolbox.pdf#page=151

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-4-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 108 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 40 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index c571ac9121..04bae729a4 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -285,13 +285,70 @@ fail:
     return ret;
 }
 
+static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
+                                  uint64_t info_begin, uint64_t info_length)
+{
+    int ret;
+    uint32_t count, rsrc_data_offset;
+    uint64_t info_end;
+    uint64_t offset;
+
+    /* read offset from begin of resource fork (info_begin) to resource data */
+    ret = read_uint32(bs, info_begin, &rsrc_data_offset);
+    if (ret < 0) {
+        goto fail;
+    } else if (rsrc_data_offset > info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* read length of resource data */
+    ret = read_uint32(bs, info_begin + 8, &count);
+    if (ret < 0) {
+        goto fail;
+    } else if (count == 0 || rsrc_data_offset + count > info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* begin of resource data (consisting of one or more resources) */
+    offset = info_begin + rsrc_data_offset;
+
+    /* end of resource data (there is possibly a following resource map
+     * which will be ignored). */
+    info_end = offset + count;
+
+    /* read offsets (mish blocks) from one or more resources in resource data */
+    while (offset < info_end) {
+        /* size of following resource */
+        ret = read_uint32(bs, offset, &count);
+        if (ret < 0) {
+            goto fail;
+        } else if (count == 0) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        offset += 4;
+
+        ret = dmg_read_mish_block(bs, ds, offset, count);
+        if (ret < 0) {
+            goto fail;
+        }
+        /* advance offset by size of resource */
+        offset += count;
+    }
+    return 0;
+
+fail:
+    return ret;
+}
+
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     BDRVDMGState *s = bs->opaque;
     DmgHeaderState ds;
-    uint64_t info_begin, info_end;
-    uint32_t count, rsrc_data_offset;
+    uint64_t rsrc_fork_offset, rsrc_fork_length;
     int64_t offset;
     int ret;
 
@@ -311,53 +368,24 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = read_uint64(bs, offset + 0x28, &info_begin);
+    /* offset of resource fork (RsrcForkOffset) */
+    ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset);
     if (ret < 0) {
         goto fail;
-    } else if (info_begin == 0) {
-        ret = -EINVAL;
-        goto fail;
     }
-
-    ret = read_uint32(bs, info_begin, &rsrc_data_offset);
+    ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length);
     if (ret < 0) {
         goto fail;
-    } else if (rsrc_data_offset != 0x100) {
-        ret = -EINVAL;
-        goto fail;
     }
-
-    ret = read_uint32(bs, info_begin + 4, &count);
-    if (ret < 0) {
-        goto fail;
-    } else if (count == 0) {
-        ret = -EINVAL;
-        goto fail;
-    }
-    /* end of resource data, ignoring the following resource map */
-    info_end = info_begin + count;
-
-    /* begin of resource data (consisting of one or more resources) */
-    offset = info_begin + 0x100;
-
-    /* read offsets (mish blocks) from one or more resources in resource data */
-    while (offset < info_end) {
-        /* size of following resource */
-        ret = read_uint32(bs, offset, &count);
-        if (ret < 0) {
-            goto fail;
-        } else if (count == 0) {
-            ret = -EINVAL;
-            goto fail;
-        }
-        offset += 4;
-
-        ret = dmg_read_mish_block(bs, &ds, offset, count);
+    if (rsrc_fork_length != 0) {
+        ret = dmg_read_resource_fork(bs, &ds,
+                                     rsrc_fork_offset, rsrc_fork_length);
         if (ret < 0) {
             goto fail;
         }
-        /* advance offset by size of resource */
-        offset += count;
+    } else {
+        ret = -EINVAL;
+        goto fail;
     }
 
     /* initialize zlib engine */

From 7aee37b93a4f694cdd670807f30b8efd33d0c721 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:07 +0100
Subject: [PATCH 23/47] block/dmg: process a buffer instead of reading ints

As the decoded plist XML is not a pointer in the file,
dmg_read_mish_block must be able to process a buffer instead of a file
pointer. Since the full buffer must be processed, let's change the
return value again to just a success flag.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-5-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 60 ++++++++++++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index 04bae729a4..4f56227fc5 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -100,6 +100,16 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
     return 0;
 }
 
+static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset)
+{
+    return be64_to_cpu(*(uint64_t *)&buffer[offset]);
+}
+
+static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset)
+{
+    return be32_to_cpu(*(uint32_t *)&buffer[offset]);
+}
+
 /* Increase max chunk sizes, if necessary.  This function is used to calculate
  * the buffer sizes needed for compressed/uncompressed chunk I/O.
  */
@@ -182,20 +192,16 @@ typedef struct DmgHeaderState {
     uint32_t max_sectors_per_chunk;
 } DmgHeaderState;
 
-static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds,
-                               int64_t offset, uint32_t count)
+static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
+                               uint8_t *buffer, uint32_t count)
 {
-    BDRVDMGState *s = bs->opaque;
     uint32_t type, i;
     int ret;
     size_t new_size;
     uint32_t chunk_count;
+    int64_t offset = 0;
 
-    ret = read_uint32(bs, offset, &type);
-    if (ret < 0) {
-        goto fail;
-    }
-
+    type = buff_read_uint32(buffer, offset);
     /* skip data that is not a valid MISH block (invalid magic or too small) */
     if (type != 0x6d697368 || count < 244) {
         /* assume success for now */
@@ -214,10 +220,7 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds,
     s->sectorcounts = g_realloc(s->sectorcounts, new_size);
 
     for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
-        ret = read_uint32(bs, offset, &s->types[i]);
-        if (ret < 0) {
-            goto fail;
-        }
+        s->types[i] = buff_read_uint32(buffer, offset);
         offset += 4;
         if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
             s->types[i] != 2) {
@@ -233,17 +236,11 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds,
         }
         offset += 4;
 
-        ret = read_uint64(bs, offset, &s->sectors[i]);
-        if (ret < 0) {
-            goto fail;
-        }
+        s->sectors[i] = buff_read_uint64(buffer, offset);
         s->sectors[i] += ds->last_out_offset;
         offset += 8;
 
-        ret = read_uint64(bs, offset, &s->sectorcounts[i]);
-        if (ret < 0) {
-            goto fail;
-        }
+        s->sectorcounts[i] = buff_read_uint64(buffer, offset);
         offset += 8;
 
         if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
@@ -254,17 +251,11 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds,
             goto fail;
         }
 
-        ret = read_uint64(bs, offset, &s->offsets[i]);
-        if (ret < 0) {
-            goto fail;
-        }
+        s->offsets[i] = buff_read_uint64(buffer, offset);
         s->offsets[i] += ds->last_in_offset;
         offset += 8;
 
-        ret = read_uint64(bs, offset, &s->lengths[i]);
-        if (ret < 0) {
-            goto fail;
-        }
+        s->lengths[i] = buff_read_uint64(buffer, offset);
         offset += 8;
 
         if (s->lengths[i] > DMG_LENGTHS_MAX) {
@@ -288,8 +279,10 @@ fail:
 static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
                                   uint64_t info_begin, uint64_t info_length)
 {
+    BDRVDMGState *s = bs->opaque;
     int ret;
     uint32_t count, rsrc_data_offset;
+    uint8_t *buffer = NULL;
     uint64_t info_end;
     uint64_t offset;
 
@@ -330,16 +323,23 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
         }
         offset += 4;
 
-        ret = dmg_read_mish_block(bs, ds, offset, count);
+        buffer = g_realloc(buffer, count);
+        ret = bdrv_pread(bs->file, offset, buffer, count);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        ret = dmg_read_mish_block(s, ds, buffer, count);
         if (ret < 0) {
             goto fail;
         }
         /* advance offset by size of resource */
         offset += count;
     }
-    return 0;
+    ret = 0;
 
 fail:
+    g_free(buffer);
     return ret;
 }
 

From f6e6652d7c9251236fc1ecc6cece36104c7af15b Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:08 +0100
Subject: [PATCH 24/47] block/dmg: validate chunk size to avoid overflow

Previously the chunk size was not checked, allowing for a large memory
allocation. This patch checks whether the chunks size is within the
resource fork length, and whether the resource fork is below the
trailer of the dmg file.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-6-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/dmg.c b/block/dmg.c
index 4f56227fc5..5c2c2c231d 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -317,7 +317,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
         ret = read_uint32(bs, offset, &count);
         if (ret < 0) {
             goto fail;
-        } else if (count == 0) {
+        } else if (count == 0 || count > info_end - offset) {
             ret = -EINVAL;
             goto fail;
         }
@@ -377,6 +377,11 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
     if (ret < 0) {
         goto fail;
     }
+    if (rsrc_fork_offset >= offset ||
+        rsrc_fork_length > offset - rsrc_fork_offset) {
+        ret = -EINVAL;
+        goto fail;
+    }
     if (rsrc_fork_length != 0) {
         ret = dmg_read_resource_fork(bs, &ds,
                                      rsrc_fork_offset, rsrc_fork_length);

From 0599e56ed468d245148530eb194be4a5056a0583 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:09 +0100
Subject: [PATCH 25/47] block/dmg: process XML plists

The format is simple enough to avoid using a full-blown XML parser. It
assumes that all BLKX items begin with the "mish" magic word, therefore
it is not a problem if other values get matched which are not a BLKX
block.

The offsets are based on the description at
http://newosxbook.com/DMG.html

For compatibility with glib 2.12, use g_base64_decode (which
additionally requires an extra buffer allocation) instead of
g_base64_decode_inplace (which is only available since glib 2.20).

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-7-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/block/dmg.c b/block/dmg.c
index 5c2c2c231d..a78506ad77 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -26,6 +26,7 @@
 #include "qemu/bswap.h"
 #include "qemu/module.h"
 #include <zlib.h>
+#include <glib.h>
 
 enum {
     /* Limit chunk sizes to prevent unreasonable amounts of memory being used
@@ -343,12 +344,67 @@ fail:
     return ret;
 }
 
+static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds,
+                              uint64_t info_begin, uint64_t info_length)
+{
+    BDRVDMGState *s = bs->opaque;
+    int ret;
+    uint8_t *buffer = NULL;
+    char *data_begin, *data_end;
+
+    /* Have at least some length to avoid NULL for g_malloc. Attempt to set a
+     * safe upper cap on the data length. A test sample had a XML length of
+     * about 1 MiB. */
+    if (info_length == 0 || info_length > 16 * 1024 * 1024) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    buffer = g_malloc(info_length + 1);
+    buffer[info_length] = '\0';
+    ret = bdrv_pread(bs->file, info_begin, buffer, info_length);
+    if (ret != info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* look for <data>...</data>. The data is 284 (0x11c) bytes after base64
+     * decode. The actual data element has 431 (0x1af) bytes which includes tabs
+     * and line feeds. */
+    data_end = (char *)buffer;
+    while ((data_begin = strstr(data_end, "<data>")) != NULL) {
+        guchar *mish;
+        gsize out_len = 0;
+
+        data_begin += 6;
+        data_end = strstr(data_begin, "</data>");
+        /* malformed XML? */
+        if (data_end == NULL) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        *data_end++ = '\0';
+        mish = g_base64_decode(data_begin, &out_len);
+        ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len);
+        g_free(mish);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    g_free(buffer);
+    return ret;
+}
+
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     BDRVDMGState *s = bs->opaque;
     DmgHeaderState ds;
     uint64_t rsrc_fork_offset, rsrc_fork_length;
+    uint64_t plist_xml_offset, plist_xml_length;
     int64_t offset;
     int ret;
 
@@ -382,12 +438,31 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         ret = -EINVAL;
         goto fail;
     }
+    /* offset of property list (XMLOffset) */
+    ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset);
+    if (ret < 0) {
+        goto fail;
+    }
+    ret = read_uint64(bs, offset + 0xe0, &plist_xml_length);
+    if (ret < 0) {
+        goto fail;
+    }
+    if (plist_xml_offset >= offset ||
+        plist_xml_length > offset - plist_xml_offset) {
+        ret = -EINVAL;
+        goto fail;
+    }
     if (rsrc_fork_length != 0) {
         ret = dmg_read_resource_fork(bs, &ds,
                                      rsrc_fork_offset, rsrc_fork_length);
         if (ret < 0) {
             goto fail;
         }
+    } else if (plist_xml_length != 0) {
+        ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length);
+        if (ret < 0) {
+            goto fail;
+        }
     } else {
         ret = -EINVAL;
         goto fail;

From 8daf425794ed624083de98caab04b9fedb873420 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:10 +0100
Subject: [PATCH 26/47] block/dmg: set virtual size to a non-zero value

Right now the virtual size is always reported as zero which makes it
impossible to convert between formats.

After this patch, the number of sectors will be read from the trailer
("koly" block).

To verify the behavior, the output of `dmg2img foo.dmg foo.img` was
compared against `qemu-img convert -f dmg -O raw foo.dmg foo.raw`. The
tests showed that the file contents are exactly the same, except that
QEMU creates a slightly larger file (it matches the total sectors
count).

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-8-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/dmg.c b/block/dmg.c
index a78506ad77..a33c131a9f 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -452,6 +452,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         ret = -EINVAL;
         goto fail;
     }
+    ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+    if (bs->total_sectors < 0) {
+        ret = -EINVAL;
+        goto fail;
+    }
     if (rsrc_fork_length != 0) {
         ret = dmg_read_resource_fork(bs, &ds,
                                      rsrc_fork_offset, rsrc_fork_length);

From c6d34865fa02463cf34634f45369ebcc725b101b Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:11 +0100
Subject: [PATCH 27/47] block/dmg: fix sector data offset calculation

This patch addresses two issues:

 - The data fork offset was not taken into account, resulting in failure
   to read an InstallESD.dmg file (5164763151 bytes) which had a
   non-zero DataForkOffset field.
 - The offset of the previous block ("partition") was unconditionally
   added to the current block because older files would start the input
   offset of a new block at zero. Newer files (including vlc-2.1.5.dmg,
   tuxpaint-0.9.15-macosx.dmg and OS X Yosemite [MAS].dmg) failed in
   reads because these files have chunk offsets, relative to the begin
   of a data fork.

Now the data offset of the mish is taken into account. While we could
check that the data_offset is within the data fork, let's not do that
here as it would only result in parse failures on invalid files (rather
than gracefully handling such bad files). dmg_read will error out if
the offset is incorrect.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-9-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index a33c131a9f..34dc6322e9 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -186,7 +186,7 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp)
 typedef struct DmgHeaderState {
     /* used internally by dmg_read_mish_block to remember offsets of blocks
      * across calls */
-    uint64_t last_in_offset;
+    uint64_t data_fork_offset;
     uint64_t last_out_offset;
     /* exported for dmg_open */
     uint32_t max_compressed_size;
@@ -201,6 +201,8 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
     size_t new_size;
     uint32_t chunk_count;
     int64_t offset = 0;
+    uint64_t data_offset;
+    uint64_t in_offset = ds->data_fork_offset;
 
     type = buff_read_uint32(buffer, offset);
     /* skip data that is not a valid MISH block (invalid magic or too small) */
@@ -209,8 +211,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         return 0;
     }
 
-    offset += 4;
-    offset += 200;
+    /* location in data fork for (compressed) blob (in bytes) */
+    data_offset = buff_read_uint64(buffer, offset + 0x18);
+    in_offset += data_offset;
+
+    /* move to begin of chunk entries */
+    offset += 204;
 
     chunk_count = (count - 204) / 40;
     new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
@@ -226,7 +232,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
             s->types[i] != 2) {
             if (s->types[i] == 0xffffffff && i > 0) {
-                ds->last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
                 ds->last_out_offset = s->sectors[i - 1] +
                                       s->sectorcounts[i - 1];
             }
@@ -253,7 +258,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         }
 
         s->offsets[i] = buff_read_uint64(buffer, offset);
-        s->offsets[i] += ds->last_in_offset;
+        s->offsets[i] += in_offset;
         offset += 8;
 
         s->lengths[i] = buff_read_uint64(buffer, offset);
@@ -412,7 +417,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
     s->n_chunks = 0;
     s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
     /* used by dmg_read_mish_block to keep track of the current I/O position */
-    ds.last_in_offset = 0;
+    ds.data_fork_offset = 0;
     ds.last_out_offset = 0;
     ds.max_compressed_size = 1;
     ds.max_sectors_per_chunk = 1;
@@ -424,6 +429,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    /* offset of data fork (DataForkOffset) */
+    ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset);
+    if (ret < 0) {
+        goto fail;
+    } else if (ds.data_fork_offset > offset) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
     /* offset of resource fork (RsrcForkOffset) */
     ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset);
     if (ret < 0) {

From 66ec3bba972b6d4433698397d25d039c230949dc Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:12 +0100
Subject: [PATCH 28/47] block/dmg: use SectorNumber from BLKX header

Previously the sector table parsing relied on the previous offset of
the DMG file. Now it uses the sector number from the BLKX header
(see http://newosxbook.com/DMG.html).

The implementation of dmg2img (from vu1tur) does not base the output
sector on the location of the terminator (0xffffffff) either so it
should be safe to drop this dependency on the previous state.

(It makes somehow makes sense, a terminator should halt further
processing of a block and is perhaps used to preallocate some space.)

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-10-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index 34dc6322e9..d144a5e810 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -187,7 +187,6 @@ typedef struct DmgHeaderState {
     /* used internally by dmg_read_mish_block to remember offsets of blocks
      * across calls */
     uint64_t data_fork_offset;
-    uint64_t last_out_offset;
     /* exported for dmg_open */
     uint32_t max_compressed_size;
     uint32_t max_sectors_per_chunk;
@@ -203,6 +202,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
     int64_t offset = 0;
     uint64_t data_offset;
     uint64_t in_offset = ds->data_fork_offset;
+    uint64_t out_offset;
 
     type = buff_read_uint32(buffer, offset);
     /* skip data that is not a valid MISH block (invalid magic or too small) */
@@ -211,6 +211,9 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         return 0;
     }
 
+    /* chunk offsets are relative to this sector number */
+    out_offset = buff_read_uint64(buffer, offset + 8);
+
     /* location in data fork for (compressed) blob (in bytes) */
     data_offset = buff_read_uint64(buffer, offset + 0x18);
     in_offset += data_offset;
@@ -231,10 +234,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         offset += 4;
         if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
             s->types[i] != 2) {
-            if (s->types[i] == 0xffffffff && i > 0) {
-                ds->last_out_offset = s->sectors[i - 1] +
-                                      s->sectorcounts[i - 1];
-            }
             chunk_count--;
             i--;
             offset += 36;
@@ -243,7 +242,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         offset += 4;
 
         s->sectors[i] = buff_read_uint64(buffer, offset);
-        s->sectors[i] += ds->last_out_offset;
+        s->sectors[i] += out_offset;
         offset += 8;
 
         s->sectorcounts[i] = buff_read_uint64(buffer, offset);
@@ -418,7 +417,6 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
     s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
     /* used by dmg_read_mish_block to keep track of the current I/O position */
     ds.data_fork_offset = 0;
-    ds.last_out_offset = 0;
     ds.max_compressed_size = 1;
     ds.max_sectors_per_chunk = 1;
 

From a8b10c6ead7f62e8eadbdaf944f371889c3c4c29 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:13 +0100
Subject: [PATCH 29/47] block/dmg: factor out block type check

In preparation for adding bzip2 support, split the type check into a
separate function. Make all offsets relative to the begin of a chunk
such that it is easier to recognize the position without having to
add up all offsets. Some comments are added to describe the fields.

There is no functional change.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-11-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index d144a5e810..cc584b55aa 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -192,6 +192,18 @@ typedef struct DmgHeaderState {
     uint32_t max_sectors_per_chunk;
 } DmgHeaderState;
 
+static bool dmg_is_known_block_type(uint32_t entry_type)
+{
+    switch (entry_type) {
+    case 0x00000001:    /* uncompressed */
+    case 0x00000002:    /* zeroes */
+    case 0x80000005:    /* zlib */
+        return true;
+    default:
+        return false;
+    }
+}
+
 static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
                                uint8_t *buffer, uint32_t count)
 {
@@ -231,22 +243,19 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
 
     for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
         s->types[i] = buff_read_uint32(buffer, offset);
-        offset += 4;
-        if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
-            s->types[i] != 2) {
+        if (!dmg_is_known_block_type(s->types[i])) {
             chunk_count--;
             i--;
-            offset += 36;
+            offset += 40;
             continue;
         }
-        offset += 4;
 
-        s->sectors[i] = buff_read_uint64(buffer, offset);
+        /* sector number */
+        s->sectors[i] = buff_read_uint64(buffer, offset + 8);
         s->sectors[i] += out_offset;
-        offset += 8;
 
-        s->sectorcounts[i] = buff_read_uint64(buffer, offset);
-        offset += 8;
+        /* sector count */
+        s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10);
 
         if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
             error_report("sector count %" PRIu64 " for chunk %" PRIu32
@@ -256,12 +265,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
             goto fail;
         }
 
-        s->offsets[i] = buff_read_uint64(buffer, offset);
+        /* offset in (compressed) data fork */
+        s->offsets[i] = buff_read_uint64(buffer, offset + 0x18);
         s->offsets[i] += in_offset;
-        offset += 8;
 
-        s->lengths[i] = buff_read_uint64(buffer, offset);
-        offset += 8;
+        /* length in (compressed) data fork */
+        s->lengths[i] = buff_read_uint64(buffer, offset + 0x20);
 
         if (s->lengths[i] > DMG_LENGTHS_MAX) {
             error_report("length %" PRIu64 " for chunk %" PRIu32
@@ -273,6 +282,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
 
         update_max_chunk_size(s, i, &ds->max_compressed_size,
                               &ds->max_sectors_per_chunk);
+        offset += 40;
     }
     s->n_chunks += chunk_count;
     return 0;

From 6b383c08c46468ecee98ecf71ffd8362e9bd7f42 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:14 +0100
Subject: [PATCH 30/47] block/dmg: support bzip2 block entry types

This patch adds support for bzip2-compressed block entries as introduced
with OS X 10.4 (source: https://en.wikipedia.org/wiki/Apple_Disk_Image).

It was tested against a 5.2G "OS X Yosemite" installation image which
stores the BLXX block in the XML property list (instead of resource
forks) and has over 5k chunks.

New configure entries are added (--enable-bzip2 / --disable-bzip2) to
control inclusion of bzip2 functionality (which requires linking against
libbz2). The help message suggests that this option is needed for DMG
files, but the tests are generic enough that other parts of QEMU can use
bzip2 if needed.

The identifiers are based on http://newosxbook.com/DMG.html.

The decompression routines are based on the zlib case, but as there is
no way to reset the decompression state (unlike zlib), memory is
allocated and deallocated for every decompression. This should not be
problematic as the decompression takes most of the time and as blocks
are typically about/over 1 MiB in size, only one allocation is done
every 2000 sectors.

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1420566495-13284-12-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/Makefile.objs |  1 +
 block/dmg.c         | 43 ++++++++++++++++++++++++++++++++++++++++++-
 configure           | 31 +++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 010afad71a..db2933e469 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -37,5 +37,6 @@ gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
 archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+dmg.o-libs         := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
diff --git a/block/dmg.c b/block/dmg.c
index cc584b55aa..797fda3ec8 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -26,6 +26,9 @@
 #include "qemu/bswap.h"
 #include "qemu/module.h"
 #include <zlib.h>
+#ifdef CONFIG_BZIP2
+#include <bzlib.h>
+#endif
 #include <glib.h>
 
 enum {
@@ -56,6 +59,9 @@ typedef struct BDRVDMGState {
     uint8_t *compressed_chunk;
     uint8_t *uncompressed_chunk;
     z_stream zstream;
+#ifdef CONFIG_BZIP2
+    bz_stream bzstream;
+#endif
 } BDRVDMGState;
 
 static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -123,6 +129,7 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
 
     switch (s->types[chunk]) {
     case 0x80000005: /* zlib compressed */
+    case 0x80000006: /* bzip2 compressed */
         compressed_size = s->lengths[chunk];
         uncompressed_sectors = s->sectorcounts[chunk];
         break;
@@ -198,6 +205,9 @@ static bool dmg_is_known_block_type(uint32_t entry_type)
     case 0x00000001:    /* uncompressed */
     case 0x00000002:    /* zeroes */
     case 0x80000005:    /* zlib */
+#ifdef CONFIG_BZIP2
+    case 0x80000006:    /* bzip2 */
+#endif
         return true;
     default:
         return false;
@@ -564,13 +574,16 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
     if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) {
         int ret;
         uint32_t chunk = search_chunk(s, sector_num);
+#ifdef CONFIG_BZIP2
+        uint64_t total_out;
+#endif
 
         if (chunk >= s->n_chunks) {
             return -1;
         }
 
         s->current_chunk = s->n_chunks;
-        switch (s->types[chunk]) {
+        switch (s->types[chunk]) { /* block entry type */
         case 0x80000005: { /* zlib compressed */
             /* we need to buffer, because only the chunk as whole can be
              * inflated. */
@@ -594,6 +607,34 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
                 return -1;
             }
             break; }
+#ifdef CONFIG_BZIP2
+        case 0x80000006: /* bzip2 compressed */
+            /* we need to buffer, because only the chunk as whole can be
+             * inflated. */
+            ret = bdrv_pread(bs->file, s->offsets[chunk],
+                             s->compressed_chunk, s->lengths[chunk]);
+            if (ret != s->lengths[chunk]) {
+                return -1;
+            }
+
+            ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0);
+            if (ret != BZ_OK) {
+                return -1;
+            }
+            s->bzstream.next_in = (char *)s->compressed_chunk;
+            s->bzstream.avail_in = (unsigned int) s->lengths[chunk];
+            s->bzstream.next_out = (char *)s->uncompressed_chunk;
+            s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk];
+            ret = BZ2_bzDecompress(&s->bzstream);
+            total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) +
+                        s->bzstream.total_out_lo32;
+            BZ2_bzDecompressEnd(&s->bzstream);
+            if (ret != BZ_STREAM_END ||
+                total_out != 512 * s->sectorcounts[chunk]) {
+                return -1;
+            }
+            break;
+#endif /* CONFIG_BZIP2 */
         case 1: /* copy */
             ret = bdrv_pread(bs->file, s->offsets[chunk],
                              s->uncompressed_chunk, s->lengths[chunk]);
diff --git a/configure b/configure
index e00e03af32..7ba4bcb866 100755
--- a/configure
+++ b/configure
@@ -313,6 +313,7 @@ glx=""
 zlib="yes"
 lzo=""
 snappy=""
+bzip2=""
 guest_agent=""
 guest_agent_with_vss="no"
 vss_win32_sdk=""
@@ -1060,6 +1061,10 @@ for opt do
   ;;
   --enable-snappy) snappy="yes"
   ;;
+  --disable-bzip2) bzip2="no"
+  ;;
+  --enable-bzip2) bzip2="yes"
+  ;;
   --enable-guest-agent) guest_agent="yes"
   ;;
   --disable-guest-agent) guest_agent="no"
@@ -1374,6 +1379,8 @@ Advanced options (experts only):
   --enable-usb-redir       enable usb network redirection support
   --enable-lzo             enable the support of lzo compression library
   --enable-snappy          enable the support of snappy compression library
+  --enable-bzip2           enable the support of bzip2 compression library (for
+                           reading bzip2-compressed dmg images)
   --disable-guest-agent    disable building of the QEMU Guest Agent
   --enable-guest-agent     enable building of the QEMU Guest Agent
   --with-vss-sdk=SDK-path  enable Windows VSS support in QEMU Guest Agent
@@ -1819,6 +1826,24 @@ EOF
     fi
 fi
 
+##########################################
+# bzip2 check
+
+if test "$bzip2" != "no" ; then
+    cat > $TMPC << EOF
+#include <bzlib.h>
+int main(void) { BZ2_bzlibVersion(); return 0; }
+EOF
+    if compile_prog "" "-lbz2" ; then
+        bzip2="yes"
+    else
+        if test "$bzip2" = "yes"; then
+            feature_not_found "libbzip2" "Install libbzip2 devel"
+        fi
+        bzip2="no"
+    fi
+fi
+
 ##########################################
 # libseccomp check
 
@@ -4385,6 +4410,7 @@ echo "vhdx              $vhdx"
 echo "Quorum            $quorum"
 echo "lzo support       $lzo"
 echo "snappy support    $snappy"
+echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 
 if test "$sdl_too_old" = "yes"; then
@@ -4743,6 +4769,11 @@ if test "$snappy" = "yes" ; then
   echo "CONFIG_SNAPPY=y" >> $config_host_mak
 fi
 
+if test "$bzip2" = "yes" ; then
+  echo "CONFIG_BZIP2=y" >> $config_host_mak
+  echo "BZIP2_LIBS=-lbz2" >> $config_host_mak
+fi
+
 if test "$libiscsi" = "yes" ; then
   echo "CONFIG_LIBISCSI=m" >> $config_host_mak
   echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak

From 177b75104da3e3a9af84975c32a44782d903c41f Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Tue, 6 Jan 2015 18:48:15 +0100
Subject: [PATCH 31/47] block/dmg: improve zeroes handling

Disk images may contain large all-zeroes gaps (1.66k sectors or 812 MiB
is seen in the real world). These blocks (type 2) do not need to be
extracted into a temporary buffer, there is no need to allocate memory
for these blocks nor to check its length.

(For the test image, the maximum uncompressed size is 1054371 bytes,
probably for a bzip2-compressed block.)

Signed-off-by: Peter Wu <peter@lekensteyn.nl>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 1420566495-13284-13-git-send-email-peter@lekensteyn.nl
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/dmg.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/block/dmg.c b/block/dmg.c
index 797fda3ec8..825c49d59a 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -137,7 +137,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
         uncompressed_sectors = (s->lengths[chunk] + 511) / 512;
         break;
     case 2: /* zero */
-        uncompressed_sectors = s->sectorcounts[chunk];
+        /* as the all-zeroes block may be large, it is treated specially: the
+         * sector is not copied from a large buffer, a simple memset is used
+         * instead. Therefore uncompressed_sectors does not need to be set. */
         break;
     }
 
@@ -267,7 +269,9 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
         /* sector count */
         s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10);
 
-        if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
+        /* all-zeroes sector (type 2) does not need to be "uncompressed" and can
+         * therefore be unbounded. */
+        if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
             error_report("sector count %" PRIu64 " for chunk %" PRIu32
                          " is larger than max (%u)",
                          s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
@@ -643,7 +647,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
             }
             break;
         case 2: /* zero */
-            memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]);
+            /* see dmg_read, it is treated specially. No buffer needs to be
+             * pre-filled, the zeroes can be set directly. */
             break;
         }
         s->current_chunk = chunk;
@@ -662,6 +667,13 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num,
         if (dmg_read_chunk(bs, sector_num + i) != 0) {
             return -1;
         }
+        /* Special case: current chunk is all zeroes. Do not perform a memcpy as
+         * s->uncompressed_chunk may be too small to cover the large all-zeroes
+         * section. dmg_read_chunk is called to find s->current_chunk */
+        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
+            memset(buf + i * 512, 0, 512);
+            continue;
+        }
         sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
         memcpy(buf + i * 512,
                s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);

From 0adfa1ed655904d5bf17fe047635a563f0229789 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 12 Jan 2015 12:31:32 +0000
Subject: [PATCH 32/47] qed: check for header size overflow

Header size is denoted in clusters.  The maximum cluster size is 64 MB
but there is no limit on header size.  Check for uint32_t overflow in
case the header size field has a whacky value.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1421065893-18875-2-git-send-email-stefanha@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/qed.c b/block/qed.c
index 80f18d82e2..892b13c806 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -440,6 +440,11 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
     s->l2_mask = s->table_nelems - 1;
     s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
 
+    /* Header size calculation must not overflow uint32_t */
+    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
+        return -EINVAL;
+    }
+
     if ((s->header.features & QED_F_BACKING_FILE)) {
         if ((uint64_t)s->header.backing_filename_offset +
             s->header.backing_filename_size >

From 319fc53e344d5cead970c74f088ae5c607d426b3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 12 Jan 2015 12:31:33 +0000
Subject: [PATCH 33/47] qemu-iotests: add 116 invalid QED input file tests

These tests exercise error code paths in the QED image format.  The
tests are very simple, they just prove that the error path exits
cleanly.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1421065893-18875-3-git-send-email-stefanha@redhat.com
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/116     | 96 ++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/116.out | 37 +++++++++++++++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 134 insertions(+)
 create mode 100755 tests/qemu-iotests/116
 create mode 100644 tests/qemu-iotests/116.out

diff --git a/tests/qemu-iotests/116 b/tests/qemu-iotests/116
new file mode 100755
index 0000000000..713ed484ba
--- /dev/null
+++ b/tests/qemu-iotests/116
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Test error code paths for invalid QED images
+#
+# The aim of this test is to exercise the error paths in qed_open() to ensure
+# there are no crashes with invalid input files.
+#
+# Copyright (C) 2015 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=stefanha@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qed
+_supported_proto generic
+_supported_os Linux
+
+
+size=128M
+
+echo
+echo "== truncated header cluster =="
+_make_test_img $size
+truncate -s 512 "$TEST_IMG"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid header magic =="
+_make_test_img $size
+poke_file "$TEST_IMG" "0" "QEDX"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid cluster size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "4" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid table size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "8" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid header size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "12" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid L1 table offset =="
+_make_test_img $size
+poke_file "$TEST_IMG" "40" "\xff\xff\xff\xff\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid image size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "48" "\xff\xff\xff\xff\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/116.out b/tests/qemu-iotests/116.out
new file mode 100644
index 0000000000..b679ceea63
--- /dev/null
+++ b/tests/qemu-iotests/116.out
@@ -0,0 +1,37 @@
+QA output created by 116
+
+== truncated header cluster ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid header magic ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Image not in QED format
+no file open, try 'help open'
+
+== invalid cluster size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid table size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid header size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid L1 table offset ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid image size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index f8bf354156..4b2b93bc19 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -116,3 +116,4 @@
 111 rw auto quick
 113 rw auto quick
 114 rw auto quick
+116 rw auto quick

From e729fa6afed3aa917287b63034244f548b79ec60 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Tue, 27 Jan 2015 08:33:55 -0500
Subject: [PATCH 34/47] block: fix off-by-one error in qcow and qcow2

This fixes an off-by-one error introduced in 9a29e18.  Both qcow and
qcow2 need to make sure to leave room for string terminator '\0' for
the backing file, so the max length of the non-terminated string is
either 1023 or PATH_MAX - 1.

Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow.c  | 2 +-
 block/qcow2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index ccbe9e0d2c..055896910e 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -215,7 +215,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     /* read the backing file name */
     if (header.backing_file_offset != 0) {
         len = header.backing_file_size;
-        if (len > 1023 || len > sizeof(bs->backing_file)) {
+        if (len > 1023 || len >= sizeof(bs->backing_file)) {
             error_setg(errp, "Backing file name too long");
             ret = -EINVAL;
             goto fail;
diff --git a/block/qcow2.c b/block/qcow2.c
index dbaf016bc7..7e614d76a4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -869,7 +869,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     if (header.backing_file_offset != 0) {
         len = header.backing_file_size;
         if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
-            len > sizeof(bs->backing_file)) {
+            len >= sizeof(bs->backing_file)) {
             error_setg(errp, "Backing file name too long");
             ret = -EINVAL;
             goto fail;

From 53f9e77f4ed898a4ea4ad2bc118c06c336ba30dd Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 26 Jan 2015 21:02:56 -0500
Subject: [PATCH 35/47] iotests: Fix 083

As of 8f9e835fd2e687d2bfe936819c3494af4343614d, probing should be
disabled in the qemu-iotests (at least when using qemu-io). This broke
083's reference output (which consisted mostly of "Could not read image
for determining its format").

This patch fixes it.

Note that one case which failed before is now successful: Disconnect
after data. This is due to qemu having read twice before (once for
probing, once for the qemu-io read command), but only once now (the
qemu-io read command). Therefore, reading is successful (which is
correct).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/083.out | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out
index 85ee8d6dd7..fd397f421f 100644
--- a/tests/qemu-iotests/083.out
+++ b/tests/qemu-iotests/083.out
@@ -62,8 +62,7 @@ no file open, try 'help open'
 === Check disconnect after neg2 ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 8 neg2 ===
 
@@ -80,49 +79,43 @@ no file open, try 'help open'
 === Check disconnect before request ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after request ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect before reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 4 reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 8 reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect before data ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after data ===
 
 
-read failed: Input/output error
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 === Check disconnect before neg-classic ===
 
@@ -157,7 +150,6 @@ no file open, try 'help open'
 === Check disconnect after neg-classic ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 *** done

From 23ab6953f4675b8663888ce6cfe40f4191be3e48 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 26 Jan 2015 21:02:57 -0500
Subject: [PATCH 36/47] iotests: Fix 100 for nbd

In case of NBD, _make_test_img starts a new NBD server. Therefore,
_cleanup_test_img (which shuts that server down) has to be invoked
before the next _make_test_img call in order to make 100 work for NBD.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/100 | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/qemu-iotests/100 b/tests/qemu-iotests/100
index 9124aba760..7c1b235b51 100755
--- a/tests/qemu-iotests/100
+++ b/tests/qemu-iotests/100
@@ -55,6 +55,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Sequential requests =="
 _make_test_img $size
@@ -66,6 +68,8 @@ $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xce 4k 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 8k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Superset overlapping requests =="
 _make_test_img $size
@@ -79,6 +83,8 @@ $QEMU_IO -c "read -P 0xcd 0 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xcd 3k 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Subset overlapping requests =="
 _make_test_img $size
@@ -92,6 +98,8 @@ $QEMU_IO -c "read -P 0xce 0 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xce 3k 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Head overlapping requests =="
 _make_test_img $size
@@ -104,6 +112,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xce 2k 2k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Tail overlapping requests =="
 _make_test_img $size
@@ -116,6 +126,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xce 0k 2k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Disjoint requests =="
 _make_test_img $size

From a231cb272611c758d45135f6a35a4dd1beaf5585 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 26 Jan 2015 21:02:58 -0500
Subject: [PATCH 37/47] iotests: Fix 104 for NBD

_make_test_img sets up an NBD server, _cleanup_test_img shuts it down;
thus, _cleanup_test_img has to be called before _make_test_img is
invoked another time.

Furthermore, the pipe through _filter_test_img was unnecessary;
_make_test_img already takes care of that.

And finally, a filter is added to _filter_img_info to replace
"nbd://127.0.0.1:10810" by "TEST_DIR/t.IMGFMT", since the former is the
way to express the full image path (normally the latter) for NBD tests.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/104           | 9 +++------
 tests/qemu-iotests/common.filter | 1 +
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/qemu-iotests/104 b/tests/qemu-iotests/104
index b471aa5bb1..f32752bb6f 100755
--- a/tests/qemu-iotests/104
+++ b/tests/qemu-iotests/104
@@ -28,11 +28,7 @@ here=`pwd`
 tmp=/tmp/$$
 status=1	# failure is the default!
 
-_cleanup()
-{
-	_cleanup_test_img
-}
-trap "_cleanup; exit \$status" 0 1 2 3 15
+trap "exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
 . ./common.rc
@@ -47,8 +43,9 @@ echo
 image_sizes="1024 1234"
 
 for s in $image_sizes; do
-    _make_test_img $s | _filter_img_create
+    _make_test_img $s
     _img_info | _filter_img_info
+    _cleanup_test_img
 done
 
 # success, all done
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index b73c70be95..06e1bb045f 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -200,6 +200,7 @@ _filter_img_info()
     sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
+        -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \
         -e "/encrypted: yes/d" \
         -e "/cluster_size: [0-9]\\+/d" \
         -e "/table_size: [0-9]\\+/d" \

From 1ce52846d3ce0e4b58caebcae84719bef6401fbb Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 26 Jan 2015 21:02:59 -0500
Subject: [PATCH 38/47] nbd: Improve error messages

This patch makes use of the Error object for nbd_receive_negotiate() so
that errors during negotiation look nicer.

Furthermore, this patch adds an additional error message if the received
magic was wrong, but would be correct for the other protocol version,
respectively: So if an export name was specified, but the NBD server
magic corresponds to an old handshake, this condition is explicitly
signaled to the user, and vice versa.

As these messages are now part of the "Could not open image" error
message, additional filtering has to be employed in iotest 083, which
this patch does as well.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/nbd-client.c         |  4 +--
 block/nbd-client.h         |  2 +-
 block/nbd.c                |  2 +-
 include/block/nbd.h        |  2 +-
 nbd.c                      | 42 ++++++++++++++++++-------------
 qemu-nbd.c                 |  7 +++++-
 tests/qemu-iotests/083     |  3 ++-
 tests/qemu-iotests/083.out | 51 +++++++++++++-------------------------
 8 files changed, 55 insertions(+), 58 deletions(-)

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 6e1c97cad0..28bfb62bed 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -373,7 +373,7 @@ void nbd_client_session_close(NbdClientSession *client)
 }
 
 int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
-    int sock, const char *export)
+                            int sock, const char *export, Error **errp)
 {
     int ret;
 
@@ -382,7 +382,7 @@ int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
     qemu_set_block(sock);
     ret = nbd_receive_negotiate(sock, export,
                                 &client->nbdflags, &client->size,
-                                &client->blocksize);
+                                &client->blocksize, errp);
     if (ret < 0) {
         logout("Failed to negotiate with the NBD server\n");
         closesocket(sock);
diff --git a/block/nbd-client.h b/block/nbd-client.h
index cd478f3a98..cfeecc2775 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -36,7 +36,7 @@ typedef struct NbdClientSession {
 } NbdClientSession;
 
 int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
-                            int sock, const char *export_name);
+                            int sock, const char *export_name, Error **errp);
 void nbd_client_session_close(NbdClientSession *client);
 
 int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,
diff --git a/block/nbd.c b/block/nbd.c
index 04cc845076..2e20831715 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -271,7 +271,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* NBD handshake */
-    result = nbd_client_session_init(&s->client, bs, sock, export);
+    result = nbd_client_session_init(&s->client, bs, sock, export, errp);
     g_free(export);
     return result;
 }
diff --git a/include/block/nbd.h b/include/block/nbd.h
index 348302c90b..b75959556c 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -75,7 +75,7 @@ enum {
 
 ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read);
 int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
-                          off_t *size, size_t *blocksize);
+                          off_t *size, size_t *blocksize, Error **errp);
 int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize);
 ssize_t nbd_send_request(int csock, struct nbd_request *request);
 ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply);
diff --git a/nbd.c b/nbd.c
index 53cf82be0b..e56afbc162 100644
--- a/nbd.c
+++ b/nbd.c
@@ -494,7 +494,7 @@ fail:
 }
 
 int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
-                          off_t *size, size_t *blocksize)
+                          off_t *size, size_t *blocksize, Error **errp)
 {
     char buf[256];
     uint64_t magic, s;
@@ -506,13 +506,13 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
     rc = -EINVAL;
 
     if (read_sync(csock, buf, 8) != 8) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read data");
         goto fail;
     }
 
     buf[8] = '\0';
     if (strlen(buf) == 0) {
-        LOG("server connection closed");
+        error_setg(errp, "Server connection closed unexpectedly");
         goto fail;
     }
 
@@ -527,12 +527,12 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
           qemu_isprint(buf[7]) ? buf[7] : '.');
 
     if (memcmp(buf, "NBDMAGIC", 8) != 0) {
-        LOG("Invalid magic received");
+        error_setg(errp, "Invalid magic received");
         goto fail;
     }
 
     if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read magic");
         goto fail;
     }
     magic = be64_to_cpu(magic);
@@ -545,52 +545,60 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
 
         TRACE("Checking magic (opts_magic)");
         if (magic != NBD_OPTS_MAGIC) {
-            LOG("Bad magic received");
+            if (magic == NBD_CLIENT_MAGIC) {
+                error_setg(errp, "Server does not support export names");
+            } else {
+                error_setg(errp, "Bad magic received");
+            }
             goto fail;
         }
         if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
-            LOG("flags read failed");
+            error_setg(errp, "Failed to read server flags");
             goto fail;
         }
         *flags = be16_to_cpu(tmp) << 16;
         /* reserved for future use */
         if (write_sync(csock, &reserved, sizeof(reserved)) !=
             sizeof(reserved)) {
-            LOG("write failed (reserved)");
+            error_setg(errp, "Failed to read reserved field");
             goto fail;
         }
         /* write the export name */
         magic = cpu_to_be64(magic);
         if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
-            LOG("write failed (magic)");
+            error_setg(errp, "Failed to send export name magic");
             goto fail;
         }
         opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
         if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
-            LOG("write failed (opt)");
+            error_setg(errp, "Failed to send export name option number");
             goto fail;
         }
         namesize = cpu_to_be32(strlen(name));
         if (write_sync(csock, &namesize, sizeof(namesize)) !=
             sizeof(namesize)) {
-            LOG("write failed (namesize)");
+            error_setg(errp, "Failed to send export name length");
             goto fail;
         }
         if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
-            LOG("write failed (name)");
+            error_setg(errp, "Failed to send export name");
             goto fail;
         }
     } else {
         TRACE("Checking magic (cli_magic)");
 
         if (magic != NBD_CLIENT_MAGIC) {
-            LOG("Bad magic received");
+            if (magic == NBD_OPTS_MAGIC) {
+                error_setg(errp, "Server requires an export name");
+            } else {
+                error_setg(errp, "Bad magic received");
+            }
             goto fail;
         }
     }
 
     if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read export length");
         goto fail;
     }
     *size = be64_to_cpu(s);
@@ -599,19 +607,19 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
 
     if (!name) {
         if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
-            LOG("read failed (flags)");
+            error_setg(errp, "Failed to read export flags");
             goto fail;
         }
         *flags = be32_to_cpup(flags);
     } else {
         if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
-            LOG("read failed (tmp)");
+            error_setg(errp, "Failed to read export flags");
             goto fail;
         }
         *flags |= be32_to_cpu(tmp);
     }
     if (read_sync(csock, &buf, 124) != 124) {
-        LOG("read failed (buf)");
+        error_setg(errp, "Failed to read reserved block");
         goto fail;
     }
     rc = 0;
diff --git a/qemu-nbd.c b/qemu-nbd.c
index d222512412..4d8df08385 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -284,6 +284,7 @@ static void *nbd_client_thread(void *arg)
     int fd, sock;
     int ret;
     pthread_t show_parts_thread;
+    Error *local_error = NULL;
 
     sock = unix_socket_outgoing(sockpath);
     if (sock < 0) {
@@ -291,8 +292,12 @@ static void *nbd_client_thread(void *arg)
     }
 
     ret = nbd_receive_negotiate(sock, NULL, &nbdflags,
-                                &size, &blocksize);
+                                &size, &blocksize, &local_error);
     if (ret < 0) {
+        if (local_error) {
+            fprintf(stderr, "%s\n", error_get_pretty(local_error));
+            error_free(local_error);
+        }
         goto out_socket;
     }
 
diff --git a/tests/qemu-iotests/083 b/tests/qemu-iotests/083
index 991a9d91db..1b2d3f1156 100755
--- a/tests/qemu-iotests/083
+++ b/tests/qemu-iotests/083
@@ -56,7 +56,8 @@ filter_nbd() {
 	#
 	# Filter out the TCP port number since this changes between runs.
 	sed -e 's#^.*nbd\.c:.*##g' \
-	    -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g'
+	    -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g' \
+            -e 's#\(exportname=foo\|PORT\): Failed to .*$#\1#'
 }
 
 check_disconnect() {
diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out
index fd397f421f..8c1441bf4f 100644
--- a/tests/qemu-iotests/083.out
+++ b/tests/qemu-iotests/083.out
@@ -1,62 +1,52 @@
 QA output created by 083
 === Check disconnect before neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 8 neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 16 neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 4 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 12 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 16 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after neg2 ===
@@ -66,14 +56,12 @@ read failed: Input/output error
 
 === Check disconnect 8 neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 10 neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before request ===
@@ -119,32 +107,27 @@ read 512/512 bytes at offset 0
 
 === Check disconnect before neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 8 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 16 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 24 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 28 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect after neg-classic ===

From 75af1f34cd5b07c3c7fcf86dfc99a42de48a600d Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Fri, 6 Feb 2015 11:54:11 +0100
Subject: [PATCH 39/47] block: introduce BDRV_REQUEST_MAX_SECTORS

we check and adjust request sizes at several places with
sometimes inconsistent checks or default values:
 INT_MAX
 INT_MAX >> BDRV_SECTOR_BITS
 UINT_MAX >> BDRV_SECTOR_BITS
 SIZE_MAX >> BDRV_SECTOR_BITS

This patches introdocues a macro for the maximal allowed sectors
per request and uses it at several places.

Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c               | 21 +++++++++------------
 hw/block/virtio-blk.c |  4 ++--
 include/block/block.h |  3 +++
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block.c b/block.c
index 8272ef901e..49e0073ce9 100644
--- a/block.c
+++ b/block.c
@@ -2647,7 +2647,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 {
     int64_t len;
 
-    if (size > INT_MAX) {
+    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
         return -EIO;
     }
 
@@ -2671,7 +2671,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors)
 {
-    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EIO;
     }
 
@@ -2758,7 +2758,7 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
     };
 
-    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -2826,13 +2826,10 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
     }
 
     for (;;) {
-        nb_sectors = target_sectors - sector_num;
+        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
         if (nb_sectors <= 0) {
             return 0;
         }
-        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
-            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
-        }
         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
         if (ret < 0) {
             error_report("error getting block status at sector %" PRId64 ": %s",
@@ -3167,7 +3164,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -3202,8 +3199,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     struct iovec iov = {0};
     int ret = 0;
 
-    int max_write_zeroes = bs->bl.max_write_zeroes ?
-                           bs->bl.max_write_zeroes : INT_MAX;
+    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+                                        BDRV_REQUEST_MAX_SECTORS);
 
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
@@ -3458,7 +3455,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -5120,7 +5117,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
+    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
     while (nb_sectors > 0) {
         int ret;
         int num = nb_sectors;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 8c51a295d9..1a8a176dcc 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -381,7 +381,7 @@ void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
     }
 
     max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
-    max_xfer_len = MIN_NON_ZERO(max_xfer_len, INT_MAX);
+    max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS);
 
     qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
           &multireq_compare);
@@ -447,7 +447,7 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
     uint64_t total_sectors;
 
-    if (nb_sectors > INT_MAX) {
+    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return false;
     }
     if (sector & dev->sector_mask) {
diff --git a/include/block/block.h b/include/block/block.h
index 3082d2b80e..25a6d62d1b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -83,6 +83,9 @@ typedef enum {
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
+#define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
+                                     INT_MAX >> BDRV_SECTOR_BITS)
+
 /*
  * Allocation status flags
  * BDRV_BLOCK_DATA: data is read from bs->file or another file

From fa21e6faa6f1d7de49fd030ebdb0722b59cf9a41 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Fri, 6 Feb 2015 14:24:43 +0300
Subject: [PATCH 40/47] nbd: fix max_discard/max_transfer_length

nbd_co_discard calls nbd_client_session_co_discard which uses uint32_t
as the length in bytes of the data to discard due to the following
definition:

struct nbd_request {
    uint32_t magic;
    uint32_t type;
    uint64_t handle;
    uint64_t from;
    uint32_t len; <-- the length of data to be discarded, in bytes
} QEMU_PACKED;

Thus we should limit bl_max_discard to UINT32_MAX >> BDRV_SECTOR_BITS to
avoid overflow.

NBD read/write code uses the same structure for transfers. Fix
max_transfer_length accordingly.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Peter Lieven <pl@kamp.de>
CC: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/nbd.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 2e20831715..b05d1d0407 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -301,6 +301,12 @@ static int nbd_co_flush(BlockDriverState *bs)
     return nbd_client_session_co_flush(&s->client);
 }
 
+static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
+    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
+}
+
 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors)
 {
@@ -396,6 +402,7 @@ static BlockDriver bdrv_nbd = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,
@@ -413,6 +420,7 @@ static BlockDriver bdrv_nbd_tcp = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,
@@ -430,6 +438,7 @@ static BlockDriver bdrv_nbd_unix = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,

From 8e8cb375e0964b4ed099cb8563029028db26a834 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Thu, 5 Feb 2015 14:55:31 +0200
Subject: [PATCH 41/47] block: Give always priority to unused entries in the
 qcow2 L2 cache

The current algorithm to replace entries from the L2 cache gives
priority to newer hits by dividing the hit count of all existing
entries by two everytime there is a cache miss.

However, if there are several cache misses the hit count of the
existing entries can easily go down to 0. This will result in those
entries being replaced even when there are others that have never been
used.

This problem is more noticeable with larger disk images and cache
sizes, since the chances of having several misses before the cache is
full are higher.

If we make sure that the hit count can never go down to 0 again,
unused entries will always have priority.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 904f6b1f44..b1155492a3 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -253,7 +253,9 @@ static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
 
         /* Give newer hits priority */
         /* TODO Check how to optimize the replacement strategy */
-        c->entries[i].cache_hits /= 2;
+        if (c->entries[i].cache_hits > 1) {
+            c->entries[i].cache_hits /= 2;
+        }
     }
 
     if (min_index == -1) {

From 8c44dfbc62a50a8bc4113f199b8662861f757591 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 6 Feb 2015 09:39:16 -0500
Subject: [PATCH 42/47] qcow2: Rewrite qcow2_alloc_bytes()

qcow2_alloc_bytes() is a function with insufficient error handling and
an unnecessary goto. This patch rewrites it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 80 +++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 9afdb40b40..9b80ca79ea 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -759,54 +759,54 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 {
     BDRVQcowState *s = bs->opaque;
-    int64_t offset, cluster_offset;
-    int free_in_cluster;
+    int64_t offset;
+    size_t free_in_cluster;
+    int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
     assert(size > 0 && size <= s->cluster_size);
-    if (s->free_byte_offset == 0) {
-        offset = qcow2_alloc_clusters(bs, s->cluster_size);
-        if (offset < 0) {
-            return offset;
+    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
+
+    offset = s->free_byte_offset;
+
+    if (offset) {
+        int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits);
+        if (refcount < 0) {
+            return refcount;
         }
-        s->free_byte_offset = offset;
-    }
- redo:
-    free_in_cluster = s->cluster_size -
-        offset_into_cluster(s, s->free_byte_offset);
-    if (size <= free_in_cluster) {
-        /* enough space in current cluster */
-        offset = s->free_byte_offset;
-        s->free_byte_offset += size;
-        free_in_cluster -= size;
-        if (free_in_cluster == 0)
-            s->free_byte_offset = 0;
-        if (offset_into_cluster(s, offset) != 0)
-            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                          QCOW2_DISCARD_NEVER);
-    } else {
-        offset = qcow2_alloc_clusters(bs, s->cluster_size);
-        if (offset < 0) {
-            return offset;
-        }
-        cluster_offset = start_of_cluster(s, s->free_byte_offset);
-        if ((cluster_offset + s->cluster_size) == offset) {
-            /* we are lucky: contiguous data */
-            offset = s->free_byte_offset;
-            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                          QCOW2_DISCARD_NEVER);
-            s->free_byte_offset += size;
-        } else {
-            s->free_byte_offset = offset;
-            goto redo;
+
+        if (refcount == 0xffff) {
+            offset = 0;
         }
     }
 
-    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
-     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
-     * be flushed before the caller's L2 table updates.
-     */
+    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
+    if (!offset || free_in_cluster < size) {
+        int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
+        if (new_cluster < 0) {
+            return new_cluster;
+        }
+
+        if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
+            offset = new_cluster;
+        }
+    }
+
+    assert(offset);
+    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The cluster refcount was incremented; refcount blocks must be flushed
+     * before the caller's L2 table updates. */
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+
+    s->free_byte_offset = offset + size;
+    if (!offset_into_cluster(s, s->free_byte_offset)) {
+        s->free_byte_offset = 0;
+    }
+
     return offset;
 }
 

From 24d6bffe8ab523b1dcd8b8a2be678775f3e26b1c Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Thu, 29 Jan 2015 10:36:58 +0100
Subject: [PATCH 43/47] blockdev: Give find_block_job() an Error ** parameter

When find_block_job() fails, all its callers build the same Error
object.  Build it in find_block_job() instead.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1422524221-8566-2-git-send-email-armbru@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockdev.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index d59efd3f15..8d6ca35ac8 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2653,7 +2653,8 @@ out:
 }
 
 /* Get the block job for a given device name and acquire its AioContext */
-static BlockJob *find_block_job(const char *device, AioContext **aio_context)
+static BlockJob *find_block_job(const char *device, AioContext **aio_context,
+                                Error **errp)
 {
     BlockDriverState *bs;
 
@@ -2673,6 +2674,7 @@ static BlockJob *find_block_job(const char *device, AioContext **aio_context)
     return bs->job;
 
 notfound:
+    error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
     *aio_context = NULL;
     return NULL;
 }
@@ -2680,10 +2682,9 @@ notfound:
 void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2695,10 +2696,9 @@ void qmp_block_job_cancel(const char *device,
                           bool has_force, bool force, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2721,10 +2721,9 @@ out:
 void qmp_block_job_pause(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2736,10 +2735,9 @@ void qmp_block_job_pause(const char *device, Error **errp)
 void qmp_block_job_resume(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2751,10 +2749,9 @@ void qmp_block_job_resume(const char *device, Error **errp)
 void qmp_block_job_complete(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 

From 2e3a0266bd84a9be9f5e23c1568db6eb7f3e9e94 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@pond.sub.org>
Date: Thu, 29 Jan 2015 10:36:59 +0100
Subject: [PATCH 44/47] blockdev: Eliminate silly QERR_BLOCK_JOB_NOT_ACTIVE
 macro

The QERR_ macros are leftovers from the days of "rich" error objects.
They're used with error_set() and qerror_report(), and expand into the
first *two* arguments.  This trickiness has become pointless.  Clean
this one up.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1422524221-8566-3-git-send-email-armbru@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 blockdev.c                | 3 ++-
 include/qapi/qmp/qerror.h | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 8d6ca35ac8..287d7af901 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2674,7 +2674,8 @@ static BlockJob *find_block_job(const char *device, AioContext **aio_context,
     return bs->job;
 
 notfound:
-    error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
+    error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
+              "No active block job on device '%s'", device);
     *aio_context = NULL;
     return NULL;
 }
diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h
index eeaf0cb981..85f1699df7 100644
--- a/include/qapi/qmp/qerror.h
+++ b/include/qapi/qmp/qerror.h
@@ -37,9 +37,6 @@ void qerror_report_err(Error *err);
 #define QERR_BASE_NOT_FOUND \
     ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found"
 
-#define QERR_BLOCK_JOB_NOT_ACTIVE \
-    ERROR_CLASS_DEVICE_NOT_ACTIVE, "No active block job on device '%s'"
-
 #define QERR_BLOCK_JOB_NOT_READY \
     ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed"
 

From 4d2855a348c5e90f56584ab9777fc877965ca2e0 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Thu, 29 Jan 2015 10:37:00 +0100
Subject: [PATCH 45/47] block: New bdrv_add_key(), convert monitor to use it

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1422524221-8566-4-git-send-email-armbru@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c               | 29 +++++++++++++++++++++++++++++
 blockdev.c            | 24 ++----------------------
 include/block/block.h |  1 +
 monitor.c             | 16 +++++++++++-----
 qmp.c                 |  8 ++++----
 5 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/block.c b/block.c
index 49e0073ce9..84af3cd210 100644
--- a/block.c
+++ b/block.c
@@ -3713,6 +3713,35 @@ int bdrv_set_key(BlockDriverState *bs, const char *key)
     return ret;
 }
 
+/*
+ * Provide an encryption key for @bs.
+ * If @key is non-null:
+ *     If @bs is not encrypted, fail.
+ *     Else if the key is invalid, fail.
+ *     Else set @bs's key to @key, replacing the existing key, if any.
+ * If @key is null:
+ *     If @bs is encrypted and still lacks a key, fail.
+ *     Else do nothing.
+ * On failure, store an error object through @errp if non-null.
+ */
+void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
+{
+    if (key) {
+        if (!bdrv_is_encrypted(bs)) {
+            error_set(errp, QERR_DEVICE_NOT_ENCRYPTED,
+                      bdrv_get_device_name(bs));
+        } else if (bdrv_set_key(bs, key) < 0) {
+            error_set(errp, QERR_INVALID_PASSWORD);
+        }
+    } else {
+        if (bdrv_key_required(bs)) {
+            error_set(errp, QERR_DEVICE_ENCRYPTED,
+                      bdrv_get_device_name(bs),
+                      bdrv_get_encrypted_filename(bs));
+        }
+    }
+}
+
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
     return bs->drv ? bs->drv->format_name : NULL;
diff --git a/blockdev.c b/blockdev.c
index 287d7af901..7d34960b96 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1793,7 +1793,6 @@ void qmp_block_passwd(bool has_device, const char *device,
     Error *local_err = NULL;
     BlockDriverState *bs;
     AioContext *aio_context;
-    int err;
 
     bs = bdrv_lookup_bs(has_device ? device : NULL,
                         has_node_name ? node_name : NULL,
@@ -1806,16 +1805,8 @@ void qmp_block_passwd(bool has_device, const char *device,
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    err = bdrv_set_key(bs, password);
-    if (err == -EINVAL) {
-        error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs));
-        goto out;
-    } else if (err < 0) {
-        error_set(errp, QERR_INVALID_PASSWORD);
-        goto out;
-    }
+    bdrv_add_key(bs, password, errp);
 
-out:
     aio_context_release(aio_context);
 }
 
@@ -1833,18 +1824,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename,
         return;
     }
 
-    if (bdrv_key_required(bs)) {
-        if (password) {
-            if (bdrv_set_key(bs, password) < 0) {
-                error_set(errp, QERR_INVALID_PASSWORD);
-            }
-        } else {
-            error_set(errp, QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
-        }
-    } else if (password) {
-        error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs));
-    }
+    bdrv_add_key(bs, password, errp);
 }
 
 void qmp_change_blockdev(const char *device, const char *filename,
diff --git a/include/block/block.h b/include/block/block.h
index 25a6d62d1b..321295e5f7 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -381,6 +381,7 @@ BlockDriverState *bdrv_next(BlockDriverState *bs);
 int bdrv_is_encrypted(BlockDriverState *bs);
 int bdrv_key_required(BlockDriverState *bs);
 int bdrv_set_key(BlockDriverState *bs, const char *key);
+void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp);
 int bdrv_query_missing_keys(void);
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                          void *opaque);
diff --git a/monitor.c b/monitor.c
index 5a24311844..c3cc060b45 100644
--- a/monitor.c
+++ b/monitor.c
@@ -5368,9 +5368,12 @@ static void bdrv_password_cb(void *opaque, const char *password,
     Monitor *mon = opaque;
     BlockDriverState *bs = readline_opaque;
     int ret = 0;
+    Error *local_err = NULL;
 
-    if (bdrv_set_key(bs, password) != 0) {
-        monitor_printf(mon, "invalid password\n");
+    bdrv_add_key(bs, password, &local_err);
+    if (local_err) {
+        monitor_printf(mon, "%s\n", error_get_pretty(local_err));
+        error_free(local_err);
         ret = -EPERM;
     }
     if (mon->password_completion_cb)
@@ -5388,17 +5391,20 @@ int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs,
                                 BlockCompletionFunc *completion_cb,
                                 void *opaque)
 {
+    Error *local_err = NULL;
     int err;
 
-    if (!bdrv_key_required(bs)) {
+    bdrv_add_key(bs, NULL, &local_err);
+    if (!local_err) {
         if (completion_cb)
             completion_cb(opaque, 0);
         return 0;
     }
 
+    /* Need a key for @bs */
+
     if (monitor_ctrl_mode(mon)) {
-        qerror_report(QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
+        qerror_report_err(local_err);
         return -1;
     }
 
diff --git a/qmp.c b/qmp.c
index 7f2d25a492..20a9e9739f 100644
--- a/qmp.c
+++ b/qmp.c
@@ -154,6 +154,7 @@ SpiceInfo *qmp_query_spice(Error **errp)
 
 void qmp_cont(Error **errp)
 {
+    Error *local_err = NULL;
     BlockDriverState *bs;
 
     if (runstate_needs_reset()) {
@@ -167,10 +168,9 @@ void qmp_cont(Error **errp)
         bdrv_iostatus_reset(bs);
     }
     for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
-        if (bdrv_key_required(bs)) {
-            error_set(errp, QERR_DEVICE_ENCRYPTED,
-                      bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
+        bdrv_add_key(bs, NULL, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
             return;
         }
     }

From b1ca639184d93984551b423d8e538ad4add5eb15 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Thu, 29 Jan 2015 10:37:01 +0100
Subject: [PATCH 46/47] block: Eliminate silly QERR_ macros used for encryption
 keys

The QERR_ macros are leftovers from the days of "rich" error objects.
They're used with error_set() and qerror_report(), and expand into the
first *two* arguments.  This trickiness has become pointless.  Clean
up QERR_DEVICE_ENCRYPTED and QERR_DEVICE_NOT_ENCRYPTED.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 1422524221-8566-5-git-send-email-armbru@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c                   | 5 +++--
 include/qapi/qmp/qerror.h | 6 ------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/block.c b/block.c
index 84af3cd210..210fd5f997 100644
--- a/block.c
+++ b/block.c
@@ -3728,14 +3728,15 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
 {
     if (key) {
         if (!bdrv_is_encrypted(bs)) {
-            error_set(errp, QERR_DEVICE_NOT_ENCRYPTED,
+            error_setg(errp, "Device '%s' is not encrypted",
                       bdrv_get_device_name(bs));
         } else if (bdrv_set_key(bs, key) < 0) {
             error_set(errp, QERR_INVALID_PASSWORD);
         }
     } else {
         if (bdrv_key_required(bs)) {
-            error_set(errp, QERR_DEVICE_ENCRYPTED,
+            error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
+                      "'%s' (%s) is encrypted",
                       bdrv_get_device_name(bs),
                       bdrv_get_encrypted_filename(bs));
         }
diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h
index 85f1699df7..986260f466 100644
--- a/include/qapi/qmp/qerror.h
+++ b/include/qapi/qmp/qerror.h
@@ -49,9 +49,6 @@ void qerror_report_err(Error *err);
 #define QERR_BUS_NOT_FOUND \
     ERROR_CLASS_GENERIC_ERROR, "Bus '%s' not found"
 
-#define QERR_DEVICE_ENCRYPTED \
-    ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted"
-
 #define QERR_DEVICE_HAS_NO_MEDIUM \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' has no medium"
 
@@ -67,9 +64,6 @@ void qerror_report_err(Error *err);
 #define QERR_DEVICE_NO_HOTPLUG \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging"
 
-#define QERR_DEVICE_NOT_ENCRYPTED \
-    ERROR_CLASS_GENERIC_ERROR, "Device '%s' is not encrypted"
-
 #define QERR_DEVICE_NOT_FOUND \
     ERROR_CLASS_DEVICE_NOT_FOUND, "Device '%s' not found"
 

From 728dacbda817b2ca259e9d337fab06bcf14e94a6 Mon Sep 17 00:00:00 2001
From: Programmingkid <programmingkidx@gmail.com>
Date: Mon, 19 Jan 2015 17:12:55 -0500
Subject: [PATCH 47/47] block/raw-posix.c: Fix raw_getlength() on Mac OS X
 block devices

This patch replaces the dummy code in raw_getlength() for block devices
on OS X, which always returned LLONG_MAX, with a real implementation
that returns the actual block device size.

Signed-off-by: John Arbuckle <programmingkidx@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 7b42f37d83..e474c17974 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1375,7 +1375,20 @@ again:
         if (size == 0)
 #endif
 #if defined(__APPLE__) && defined(__MACH__)
-        size = LLONG_MAX;
+        {
+            uint64_t sectors = 0;
+            uint32_t sector_size = 0;
+
+            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
+               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
+                size = sectors * sector_size;
+            } else {
+                size = lseek(fd, 0LL, SEEK_END);
+                if (size < 0) {
+                    return -errno;
+                }
+            }
+        }
 #else
         size = lseek(fd, 0LL, SEEK_END);
         if (size < 0) {