From 819fa276311ce328a8e38ad9306c1093961b3f4b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 9 Dec 2014 18:15:08 +0000 Subject: [PATCH 01/47] Restore atapi_dma flag across migration If a migration happens just after the guest has kicked off an ATAPI command and kicked off DMA, we lose the atapi_dma flag, and the destination tries to complete the command as PIO rather than DMA. This upsets Linux; modern libata based kernels stumble and recover OK, older kernels end up passing bad data to userspace. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: John Snow Signed-off-by: Kevin Wolf --- hw/ide/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/ide/core.c b/hw/ide/core.c index d4af5e2eb1..ac3f015a8d 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -2417,6 +2417,7 @@ static int ide_drive_pio_post_load(void *opaque, int version_id) s->end_transfer_func = transfer_end_table[s->end_transfer_fn_idx]; s->data_ptr = s->io_buffer + s->cur_io_buffer_offset; s->data_end = s->data_ptr + s->cur_io_buffer_len; + s->atapi_dma = s->feature & 1; /* as per cmd_packet */ return 0; } From a71754e5b03fd3b8b8c6d3bc2a39f75bead729de Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 9 Dec 2014 18:15:09 +0000 Subject: [PATCH 02/47] atapi migration: Throw recoverable error to avoid recovery (With the previous atapi_dma flag recovery) If migration happens between the ATAPI command being written and the bmdma being started, the DMA is dropped. Eventually the guest times out and recovers, but that can take many seconds. (This is rare, on a pingpong reading the CD continuously I hit this about ~1/30-1/50 migrates) I don't think we've got enough state to be able to recover safely at this point, so I throw a 'medium error, no seek complete' that I'm assuming guests will try and recover from an apparently dirty CD. OK, it's a hack, the real solution is probably to push a lot of ATAPI state into the migration stream, but this is a fix that works with no stream changes. Tested only on Linux (both RHEL5 (pre-libata) and RHEL7). Signed-off-by: Dr. David Alan Gilbert Reviewed-by: John Snow Signed-off-by: Kevin Wolf --- hw/ide/atapi.c | 17 +++++++++++++++++ hw/ide/internal.h | 2 ++ hw/ide/pci.c | 11 +++++++++++ 3 files changed, 30 insertions(+) diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c index a71e6e014f..1bf8b34528 100644 --- a/hw/ide/atapi.c +++ b/hw/ide/atapi.c @@ -394,6 +394,23 @@ static void ide_atapi_cmd_read(IDEState *s, int lba, int nb_sectors, } } + +/* Called by *_restart_bh when the transfer function points + * to ide_atapi_cmd + */ +void ide_atapi_dma_restart(IDEState *s) +{ + /* + * I'm not sure we have enough stored to restart the command + * safely, so give the guest an error it should recover from. + * I'm assuming most guests will try to recover from something + * listed as a medium error on a CD; it seems to work on Linux. + * This would be more of a problem if we did any other type of + * DMA operation. + */ + ide_atapi_cmd_error(s, MEDIUM_ERROR, ASC_NO_SEEK_COMPLETE); +} + static inline uint8_t ide_atapi_set_profile(uint8_t *buf, uint8_t *index, uint16_t profile) { diff --git a/hw/ide/internal.h b/hw/ide/internal.h index c998003bf3..ee9a57f039 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -289,6 +289,7 @@ typedef struct IDEDMAOps IDEDMAOps; #define ATAPI_INT_REASON_TAG 0xf8 /* same constants as bochs */ +#define ASC_NO_SEEK_COMPLETE 0x02 #define ASC_ILLEGAL_OPCODE 0x20 #define ASC_LOGICAL_BLOCK_OOR 0x21 #define ASC_INV_FIELD_IN_CMD_PACKET 0x24 @@ -530,6 +531,7 @@ void ide_dma_error(IDEState *s); void ide_atapi_cmd_ok(IDEState *s); void ide_atapi_cmd_error(IDEState *s, int sense_key, int asc); +void ide_atapi_dma_restart(IDEState *s); void ide_atapi_io_error(IDEState *s, int ret); void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val); diff --git a/hw/ide/pci.c b/hw/ide/pci.c index bee5ad39fe..e3f2054a90 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -235,6 +235,17 @@ static void bmdma_restart_bh(void *opaque) } } else if (error_status & IDE_RETRY_FLUSH) { ide_flush_cache(bmdma_active_if(bm)); + } else { + IDEState *s = bmdma_active_if(bm); + + /* + * We've not got any bits to tell us about ATAPI - but + * we do have the end_transfer_func that tells us what + * we're trying to do. + */ + if (s->end_transfer_func == ide_atapi_cmd) { + ide_atapi_dma_restart(s); + } } } From 1486df0e312613c6f235ee08488c90cc6b987094 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:11 +0300 Subject: [PATCH 03/47] block/raw-posix: create translate_err helper to merge errno values actually the code if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || ret == -ENOTTY) { ret = -ENOTSUP; } is present twice and will be added a couple more times. Create helper for this. CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Reviewed-by: Peter Lieven Signed-off-by: Kevin Wolf --- block/raw-posix.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index e51293a455..24300d0f78 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -893,6 +893,15 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) } #endif +static int translate_err(int err) +{ + if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || + err == -ENOTTY) { + err = -ENOTSUP; + } + return err; +} + static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; @@ -921,10 +930,9 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) #endif } - if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || - ret == -ENOTTY) { + ret = translate_err(ret); + if (ret == -ENOTSUP) { s->has_write_zeroes = false; - ret = -ENOTSUP; } return ret; } @@ -968,10 +976,9 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) #endif } - if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || - ret == -ENOTTY) { + ret = translate_err(ret); + if (ret == -ENOTSUP) { s->has_discard = false; - ret = -ENOTSUP; } return ret; } From 0b99171230bcdfe49956f283fc2e51a37efc7bfc Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:12 +0300 Subject: [PATCH 04/47] block/raw-posix: create do_fallocate helper The pattern do { if (fallocate(s->fd, mode, offset, len) == 0) { return 0; } } while (errno == EINTR); ret = translate_err(-errno); will be commonly useful in next patches. Create helper for it. CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Reviewed-by: Peter Lieven Signed-off-by: Kevin Wolf --- block/raw-posix.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 24300d0f78..2aa268aeab 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -902,6 +902,18 @@ static int translate_err(int err) return err; } +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) +static int do_fallocate(int fd, int mode, off_t offset, off_t len) +{ + do { + if (fallocate(fd, mode, offset, len) == 0) { + return 0; + } + } while (errno == EINTR); + return translate_err(-errno); +} +#endif + static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; @@ -965,14 +977,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) #endif #ifdef CONFIG_FALLOCATE_PUNCH_HOLE - do { - if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - aiocb->aio_offset, aiocb->aio_nbytes) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; + ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); #endif } From 37cc9f7f684ed035da63274daca1594c7ee16213 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:13 +0300 Subject: [PATCH 05/47] block/raw-posix: refactor handle_aiocb_write_zeroes a bit move code dealing with a block device to a separate function. This will allow to implement additional processing for ordinary files. Please note, that xfs_code has been moved before checking for s->has_write_zeroes as xfs_write_zeroes does not touch this flag inside. This makes code a bit more consistent. CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 48 +++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 2aa268aeab..d3910fdbf3 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -914,41 +914,49 @@ static int do_fallocate(int fd, int mode, off_t offset, off_t len) } #endif -static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) { - int ret = -EOPNOTSUPP; + int ret = -ENOTSUP; BDRVRawState *s = aiocb->bs->opaque; - if (s->has_write_zeroes == 0) { + if (!s->has_write_zeroes) { return -ENOTSUP; } - if (aiocb->aio_type & QEMU_AIO_BLKDEV) { #ifdef BLKZEROOUT - do { - uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; - if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; -#endif - } else { -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; } -#endif - } + } while (errno == EINTR); + + ret = translate_err(-errno); +#endif - ret = translate_err(ret); if (ret == -ENOTSUP) { s->has_write_zeroes = false; } return ret; } +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ + BDRVRawState *s = aiocb->bs->opaque; + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { + return handle_aiocb_write_zeroes_block(aiocb); + } + +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + + return -ENOTSUP; +} + static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; From b953f075007a7860adce9fa410b2e116a3d5e29c Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:14 +0300 Subject: [PATCH 06/47] block: use fallocate(FALLOC_FL_ZERO_RANGE) in handle_aiocb_write_zeroes This efficiently writes zeroes on Linux if the kernel is capable enough. FALLOC_FL_ZERO_RANGE correctly handles all cases, including and not including file expansion. CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 15 +++++++++++++-- configure | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index d3910fdbf3..5a777e7137 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -60,7 +60,7 @@ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #endif #endif -#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) #include #endif #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -902,7 +902,7 @@ static int translate_err(int err) return err; } -#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) static int do_fallocate(int fd, int mode, off_t offset, off_t len) { do { @@ -954,6 +954,17 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) } #endif +#ifdef CONFIG_FALLOCATE_ZERO_RANGE + if (s->has_write_zeroes) { + int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_write_zeroes = false; + } +#endif + return -ENOTSUP; } diff --git a/configure b/configure index f185dd0838..e00e03af32 100755 --- a/configure +++ b/configure @@ -3335,6 +3335,22 @@ if compile_prog "" "" ; then fallocate_punch_hole=yes fi +# check that fallocate supports range zeroing inside the file +fallocate_zero_range=no +cat > $TMPC << EOF +#include +#include + +int main(void) +{ + fallocate(0, FALLOC_FL_ZERO_RANGE, 0, 0); + return 0; +} +EOF +if compile_prog "" "" ; then + fallocate_zero_range=yes +fi + # check for posix_fallocate posix_fallocate=no cat > $TMPC << EOF @@ -4567,6 +4583,9 @@ fi if test "$fallocate_punch_hole" = "yes" ; then echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak fi +if test "$fallocate_zero_range" = "yes" ; then + echo "CONFIG_FALLOCATE_ZERO_RANGE=y" >> $config_host_mak +fi if test "$posix_fallocate" = "yes" ; then echo "CONFIG_POSIX_FALLOCATE=y" >> $config_host_mak fi From d50d82221934696633296975aa433fe8aeeac714 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:15 +0300 Subject: [PATCH 07/47] block/raw-posix: call plain fallocate in handle_aiocb_write_zeroes There is a possibility that we are extending our image and thus writing zeroes beyond the end of the file. In this case we do not need to care about the hole to make sure that there is no data in the file under this offset (pre-condition to fallocate(0) to work). We could simply call fallocate(0). This improves the performance of writing zeroes even on really old platforms which do not have even FALLOC_FL_PUNCH_HOLE. Before the patch do_fallocate was used when either CONFIG_FALLOCATE_PUNCH_HOLE or CONFIG_FALLOCATE_ZERO_RANGE are defined. Now the story is different. CONFIG_FALLOCATE is defined when Linux fallocate is defined, posix_fallocate is completely different story (CONFIG_POSIX_FALLOCATE). CONFIG_FALLOCATE is mandatory prerequite for both CONFIG_FALLOCATE_PUNCH_HOLE and CONFIG_FALLOCATE_ZERO_RANGE thus we are on the safe side. CC: Max Reitz CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 5a777e7137..1c88ad89d8 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -147,6 +147,7 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; bool discard_zeroes:1; + bool has_fallocate; bool needs_alignment; } BDRVRawState; @@ -452,6 +453,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } if (S_ISREG(st.st_mode)) { s->discard_zeroes = true; + s->has_fallocate = true; } if (S_ISBLK(st.st_mode)) { #ifdef BLKDISCARDZEROES @@ -902,7 +904,7 @@ static int translate_err(int err) return err; } -#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) +#ifdef CONFIG_FALLOCATE static int do_fallocate(int fd, int mode, off_t offset, off_t len) { do { @@ -965,6 +967,16 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) } #endif +#ifdef CONFIG_FALLOCATE + if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { + int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; + } +#endif + return -ENOTSUP; } From 1cdc3239f1bb8c8f18954defe3cb813edc9df4a0 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 30 Jan 2015 11:42:16 +0300 Subject: [PATCH 08/47] block: use fallocate(FALLOC_FL_PUNCH_HOLE) & fallocate(0) to write zeroes This sequence works efficiently if FALLOC_FL_ZERO_RANGE is not supported. Unfortunately, FALLOC_FL_ZERO_RANGE is supported on really modern systems and only for a couple of filesystems. FALLOC_FL_PUNCH_HOLE is much more mature. The sequence of 2 operations FALLOC_FL_PUNCH_HOLE and 0 is necessary due to the following reasons: - FALLOC_FL_PUNCH_HOLE creates a hole in the file, the file becomes sparse. In order to retain original functionality we must allocate disk space afterwards. This is done using fallocate(0) call - fallocate(0) without preceeding FALLOC_FL_PUNCH_HOLE will do nothing if called above already allocated areas of the file, i.e. the content will not be zeroed This should increase the performance a bit for not-so-modern kernels. CC: Max Reitz CC: Kevin Wolf CC: Stefan Hajnoczi CC: Peter Lieven CC: Fam Zheng Signed-off-by: Denis V. Lunev Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/block/raw-posix.c b/block/raw-posix.c index 1c88ad89d8..7b42f37d83 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -967,6 +967,25 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) } #endif +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + if (s->has_discard && s->has_fallocate) { + int ret = do_fallocate(s->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0) { + ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; + } else if (ret != -ENOTSUP) { + return ret; + } else { + s->has_discard = false; + } + } +#endif + #ifdef CONFIG_FALLOCATE if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); From 98764152ad8ec9fa4e7bb6d8e10f8a7a7ce273d7 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 15:48:34 +0100 Subject: [PATCH 09/47] block: change default for discard and write zeroes to INT_MAX do not trim requests if the driver does not supply a limit through BlockLimits. For write zeroes we still keep a limit for the unsupported path to avoid allocating a big bounce buffer. Suggested-by: Kevin Wolf Suggested-by: Denis V. Lunev Signed-off-by: Peter Lieven Signed-off-by: Kevin Wolf --- block.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/block.c b/block.c index d45e4ddf31..ee7ff2cac8 100644 --- a/block.c +++ b/block.c @@ -3192,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, BDRV_REQ_COPY_ON_READ); } -/* if no limit is specified in the BlockLimits use a default - * of 32768 512-byte sectors (16 MiB) per request. - */ -#define MAX_WRITE_ZEROES_DEFAULT 32768 +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) @@ -3206,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, int ret = 0; int max_write_zeroes = bs->bl.max_write_zeroes ? - bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; + bs->bl.max_write_zeroes : INT_MAX; while (nb_sectors > 0 && !ret) { int num = nb_sectors; @@ -3242,7 +3239,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, if (ret == -ENOTSUP) { /* Fall back to bounce buffer if write zeroes is unsupported */ int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_DEFAULT); + MAX_WRITE_ZEROES_BOUNCE_BUFFER); num = MIN(num, max_xfer_len); iov.iov_len = num * BDRV_SECTOR_SIZE; if (iov.iov_base == NULL) { @@ -5097,11 +5094,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); } -/* if no limit is specified in the BlockLimits use a default - * of 32768 512-byte sectors (16 MiB) per request. - */ -#define MAX_DISCARD_DEFAULT 32768 - int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { @@ -5126,7 +5118,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } - max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT; + max_discard = bs->bl.max_discard ? bs->bl.max_discard : INT_MAX; while (nb_sectors > 0) { int ret; int num = nb_sectors; From 61979a6adfc1c4590f31ca295998e160226785c4 Mon Sep 17 00:00:00 2001 From: Don Slutz Date: Fri, 9 Jan 2015 10:17:35 -0500 Subject: [PATCH 10/47] qemu-img: Add QEMU_PKGVERSION to QEMU_IMG_VERSION This is the same way vl.c handles this. Signed-off-by: Don Slutz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- qemu-img.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index 4e9a7f5741..e148af8a3e 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -35,7 +35,7 @@ #include "block/qapi.h" #include -#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION \ +#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION QEMU_PKGVERSION \ ", Copyright (c) 2004-2008 Fabrice Bellard\n" typedef struct img_cmd_t { From 35f5a49374098733247c640cbdcbafcfc792c11f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 28 Jan 2015 09:51:13 +0800 Subject: [PATCH 11/47] qed: Really remove unused field QEDAIOCB.finished The commit 533ffb17a that removed qed_aiocb_info.cancel said to remove this but didn't do it. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/qed.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/qed.h b/block/qed.h index d3934a05cd..615e676fc8 100644 --- a/block/qed.h +++ b/block/qed.h @@ -133,7 +133,6 @@ typedef struct QEDAIOCB { int bh_ret; /* final return status for completion bh */ QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ int flags; /* QED_AIOCB_* bits ORed together */ - bool *finished; /* signal for cancel completion */ uint64_t end_pos; /* request end on block device, in bytes */ /* User scatter-gather list */ From f4564d53c6952c61fb3ed8ee17a6e0f2f6bf0857 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 14:52:18 +0100 Subject: [PATCH 12/47] block: add accounting for merged requests Signed-off-by: Peter Lieven Reviewed-by: Eric Blake Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 2 ++ block/accounting.c | 7 +++++++ block/qapi.c | 2 ++ hmp.c | 6 +++++- include/block/accounting.h | 3 +++ qapi/block-core.json | 9 ++++++++- qmp-commands.hx | 22 ++++++++++++++++++---- 7 files changed, 45 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index ee7ff2cac8..8272ef901e 100644 --- a/block.c +++ b/block.c @@ -4559,6 +4559,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, } } + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + return outidx + 1; } diff --git a/block/accounting.c b/block/accounting.c index 18102f015d..01d594ffdc 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -54,3 +54,10 @@ void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num, stats->wr_highest_sector = sector_num + nb_sectors - 1; } } + +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests) +{ + assert(type < BLOCK_MAX_IOTYPE); + stats->merged[type] += num_requests; +} diff --git a/block/qapi.c b/block/qapi.c index 75c388e90b..d1a891796c 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -335,6 +335,8 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE]; s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ]; s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE]; + s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ]; + s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE]; s->stats->wr_highest_offset = bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE; s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH]; diff --git a/hmp.c b/hmp.c index a42c5c07be..b47f331f4d 100644 --- a/hmp.c +++ b/hmp.c @@ -474,6 +474,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict) " wr_total_time_ns=%" PRId64 " rd_total_time_ns=%" PRId64 " flush_total_time_ns=%" PRId64 + " rd_merged=%" PRId64 + " wr_merged=%" PRId64 "\n", stats->value->stats->rd_bytes, stats->value->stats->wr_bytes, @@ -482,7 +484,9 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict) stats->value->stats->flush_operations, stats->value->stats->wr_total_time_ns, stats->value->stats->rd_total_time_ns, - stats->value->stats->flush_total_time_ns); + stats->value->stats->flush_total_time_ns, + stats->value->stats->rd_merged, + stats->value->stats->wr_merged); } qapi_free_BlockStatsList(stats_list); diff --git a/include/block/accounting.h b/include/block/accounting.h index 50b42b3808..4c406cff7a 100644 --- a/include/block/accounting.h +++ b/include/block/accounting.h @@ -39,6 +39,7 @@ typedef struct BlockAcctStats { uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; uint64_t nr_ops[BLOCK_MAX_IOTYPE]; uint64_t total_time_ns[BLOCK_MAX_IOTYPE]; + uint64_t merged[BLOCK_MAX_IOTYPE]; uint64_t wr_highest_sector; } BlockAcctStats; @@ -53,5 +54,7 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie); void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num, unsigned int nb_sectors); +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests); #endif diff --git a/qapi/block-core.json b/qapi/block-core.json index 80984d1660..b7d9772444 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -407,13 +407,20 @@ # growable sparse files (like qcow2) that are used on top # of a physical device. # +# @rd_merged: Number of read requests that have been merged into another +# request (Since 2.3). +# +# @wr_merged: Number of write requests that have been merged into another +# request (Since 2.3). +# # Since: 0.14.0 ## { 'type': 'BlockDeviceStats', 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int', 'wr_operations': 'int', 'flush_operations': 'int', 'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int', - 'rd_total_time_ns': 'int', 'wr_highest_offset': 'int' } } + 'rd_total_time_ns': 'int', 'wr_highest_offset': 'int', + 'rd_merged': 'int', 'wr_merged': 'int' } } ## # @BlockStats: diff --git a/qmp-commands.hx b/qmp-commands.hx index c5f16dd922..af3fd19935 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2303,6 +2303,10 @@ Each json-object contain the following: - "flush_total_time_ns": total time spend on cache flushes in nano-seconds (json-int) - "wr_highest_offset": Highest offset of a sector written since the BlockDriverState has been opened (json-int) + - "rd_merged": number of read requests that have been merged into + another request (json-int) + - "wr_merged": number of write requests that have been merged into + another request (json-int) - "parent": Contains recursively the statistics of the underlying protocol (e.g. the host file for a qcow2 image). If there is no underlying protocol, this field is omitted @@ -2326,6 +2330,8 @@ Example: "rd_total_times_ns":3465673657 "flush_total_times_ns":49653 "flush_operations":61, + "rd_merged":0, + "wr_merged":0 } }, "stats":{ @@ -2337,7 +2343,9 @@ Example: "flush_operations":51, "wr_total_times_ns":313253456 "rd_total_times_ns":3465673657 - "flush_total_times_ns":49653 + "flush_total_times_ns":49653, + "rd_merged":0, + "wr_merged":0 } }, { @@ -2351,7 +2359,9 @@ Example: "flush_operations":0, "wr_total_times_ns":0 "rd_total_times_ns":0 - "flush_total_times_ns":0 + "flush_total_times_ns":0, + "rd_merged":0, + "wr_merged":0 } }, { @@ -2365,7 +2375,9 @@ Example: "flush_operations":0, "wr_total_times_ns":0 "rd_total_times_ns":0 - "flush_total_times_ns":0 + "flush_total_times_ns":0, + "rd_merged":0, + "wr_merged":0 } }, { @@ -2379,7 +2391,9 @@ Example: "flush_operations":0, "wr_total_times_ns":0 "rd_total_times_ns":0 - "flush_total_times_ns":0 + "flush_total_times_ns":0, + "rd_merged":0, + "wr_merged":0 } } ] From d901f3c457ade0b4934427c0e8608dea31610720 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 14:52:19 +0100 Subject: [PATCH 13/47] hw/virtio-blk: add a constant for max number of merged requests As it was not obvious (at least for me) where the 32 comes from; add a constant for it. Signed-off-by: Peter Lieven Reviewed-by: Eric Blake Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 2 +- include/hw/virtio/virtio-blk.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 4032fcae27..e04adb8f16 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -360,7 +360,7 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size, BLOCK_ACCT_WRITE); - if (mrb->num_writes == 32) { + if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) { virtio_submit_multiwrite(req->dev->blk, mrb); } diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 4652b70b5d..6ef3fa57cf 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -134,8 +134,10 @@ typedef struct VirtIOBlock { struct VirtIOBlockDataPlane *dataplane; } VirtIOBlock; +#define VIRTIO_BLK_MAX_MERGE_REQS 32 + typedef struct MultiReqBuffer { - BlockRequest blkreq[32]; + BlockRequest blkreq[VIRTIO_BLK_MAX_MERGE_REQS]; unsigned int num_writes; } MultiReqBuffer; From 454057b7d9b9ad141bd5df8c4075745e56b4870f Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 14:52:20 +0100 Subject: [PATCH 14/47] block-backend: expose bs->bl.max_transfer_length Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/block-backend.c | 5 +++++ include/sysemu/block-backend.h | 1 + 2 files changed, 6 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index d00c129f15..c28e2402c7 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -580,6 +580,11 @@ int blk_get_flags(BlockBackend *blk) return bdrv_get_flags(blk->bs); } +int blk_get_max_transfer_length(BlockBackend *blk) +{ + return blk->bs->bl.max_transfer_length; +} + void blk_set_guest_block_size(BlockBackend *blk, int align) { bdrv_set_guest_block_size(blk->bs, align); diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 8871a021d0..aab12b982c 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -127,6 +127,7 @@ int blk_is_inserted(BlockBackend *blk); void blk_lock_medium(BlockBackend *blk, bool locked); void blk_eject(BlockBackend *blk, bool eject_flag); int blk_get_flags(BlockBackend *blk); +int blk_get_max_transfer_length(BlockBackend *blk); void blk_set_guest_block_size(BlockBackend *blk, int align); void *blk_blockalign(BlockBackend *blk, size_t size); bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp); From 95f7142abc86a916682bd735aecd90172ffa0d30 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 14:52:21 +0100 Subject: [PATCH 15/47] virtio-blk: introduce multiread this patch finally introduces multiread support to virtio-blk. While multiwrite support was there for a long time, read support was missing. The complete merge logic is moved into virtio-blk.c which has been the only user of request merging ever since. This is required to be able to merge chunks of requests and immediately invoke callbacks for those requests. Secondly, this is required to switch to direct invocation of coroutines which is planned at a later stage. The following benchmarks show the performance of running fio with 4 worker threads on a local ram disk. The numbers show the average of 10 test runs after 1 run as warmup phase. | 4k | 64k | 4k MB/s | rd seq | rd rand | rd seq | rd rand | wr seq | wr rand --------------+--------+---------+--------+---------+--------+-------- master | 1221 | 1187 | 4178 | 4114 | 1745 | 1213 multiread | 1829 | 1189 | 4639 | 4110 | 1894 | 1216 Signed-off-by: Peter Lieven Signed-off-by: Kevin Wolf --- hw/block/dataplane/virtio-blk.c | 8 +- hw/block/virtio-blk.c | 296 ++++++++++++++++++++++---------- include/hw/virtio/virtio-blk.h | 19 +- trace-events | 1 + 4 files changed, 218 insertions(+), 106 deletions(-) diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index 39c5d7103c..be957d1117 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -96,9 +96,7 @@ static void handle_notify(EventNotifier *e) event_notifier_test_and_clear(&s->host_notifier); blk_io_plug(s->conf->conf.blk); for (;;) { - MultiReqBuffer mrb = { - .num_writes = 0, - }; + MultiReqBuffer mrb = {}; int ret; /* Disable guest->host notifies to avoid unnecessary vmexits */ @@ -120,7 +118,9 @@ static void handle_notify(EventNotifier *e) virtio_blk_handle_request(req, &mrb); } - virtio_submit_multiwrite(s->conf->conf.blk, &mrb); + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s->conf->conf.blk, &mrb); + } if (likely(ret == -EAGAIN)) { /* vring emptied */ /* Re-enable guest->host notifies and stop processing the vring. diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index e04adb8f16..d0a01a8864 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -34,6 +34,7 @@ VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s) req->dev = s; req->qiov.size = 0; req->next = NULL; + req->mr_next = NULL; return req; } @@ -84,20 +85,32 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, static void virtio_blk_rw_complete(void *opaque, int ret) { - VirtIOBlockReq *req = opaque; + VirtIOBlockReq *next = opaque; - trace_virtio_blk_rw_complete(req, ret); + while (next) { + VirtIOBlockReq *req = next; + next = req->mr_next; + trace_virtio_blk_rw_complete(req, ret); - if (ret) { - int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type); - bool is_read = !(p & VIRTIO_BLK_T_OUT); - if (virtio_blk_handle_rw_error(req, -ret, is_read)) - return; + if (req->qiov.nalloc != -1) { + /* If nalloc is != 1 req->qiov is a local copy of the original + * external iovec. It was allocated in submit_merged_requests + * to be able to merge requests. */ + qemu_iovec_destroy(&req->qiov); + } + + if (ret) { + int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type); + bool is_read = !(p & VIRTIO_BLK_T_OUT); + if (virtio_blk_handle_rw_error(req, -ret, is_read)) { + continue; + } + } + + virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + block_acct_done(blk_get_stats(req->dev->blk), &req->acct); + virtio_blk_free_request(req); } - - virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); - block_acct_done(blk_get_stats(req->dev->blk), &req->acct); - virtio_blk_free_request(req); } static void virtio_blk_flush_complete(void *opaque, int ret) @@ -291,24 +304,127 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) } } -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb) +static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb, + int start, int num_reqs, int niov) { - int i, ret; + QEMUIOVector *qiov = &mrb->reqs[start]->qiov; + int64_t sector_num = mrb->reqs[start]->sector_num; + int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE; + bool is_write = mrb->is_write; - if (!mrb->num_writes) { + if (num_reqs > 1) { + int i; + struct iovec *tmp_iov = qiov->iov; + int tmp_niov = qiov->niov; + + /* mrb->reqs[start]->qiov was initialized from external so we can't + * modifiy it here. We need to initialize it locally and then add the + * external iovecs. */ + qemu_iovec_init(qiov, niov); + + for (i = 0; i < tmp_niov; i++) { + qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len); + } + + for (i = start + 1; i < start + num_reqs; i++) { + qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0, + mrb->reqs[i]->qiov.size); + mrb->reqs[i - 1]->mr_next = mrb->reqs[i]; + nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE; + } + assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE); + + trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num, + nb_sectors, is_write); + block_acct_merge_done(blk_get_stats(blk), + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ, + num_reqs - 1); + } + + if (is_write) { + blk_aio_writev(blk, sector_num, qiov, nb_sectors, + virtio_blk_rw_complete, mrb->reqs[start]); + } else { + blk_aio_readv(blk, sector_num, qiov, nb_sectors, + virtio_blk_rw_complete, mrb->reqs[start]); + } +} + +static int multireq_compare(const void *a, const void *b) +{ + const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a, + *req2 = *(VirtIOBlockReq **)b; + + /* + * Note that we can't simply subtract sector_num1 from sector_num2 + * here as that could overflow the return value. + */ + if (req1->sector_num > req2->sector_num) { + return 1; + } else if (req1->sector_num < req2->sector_num) { + return -1; + } else { + return 0; + } +} + +void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb) +{ + int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0; + int max_xfer_len = 0; + int64_t sector_num = 0; + + if (mrb->num_reqs == 1) { + submit_requests(blk, mrb, 0, 1, -1); + mrb->num_reqs = 0; return; } - ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes); - if (ret != 0) { - for (i = 0; i < mrb->num_writes; i++) { - if (mrb->blkreq[i].error) { - virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO); + max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk); + max_xfer_len = MIN_NON_ZERO(max_xfer_len, INT_MAX); + + qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs), + &multireq_compare); + + for (i = 0; i < mrb->num_reqs; i++) { + VirtIOBlockReq *req = mrb->reqs[i]; + if (num_reqs > 0) { + bool merge = true; + + /* merge would exceed maximum number of IOVs */ + if (niov + req->qiov.niov > IOV_MAX) { + merge = false; + } + + /* merge would exceed maximum transfer length of backend device */ + if (req->qiov.size / BDRV_SECTOR_SIZE + nb_sectors > max_xfer_len) { + merge = false; + } + + /* requests are not sequential */ + if (sector_num + nb_sectors != req->sector_num) { + merge = false; + } + + if (!merge) { + submit_requests(blk, mrb, start, num_reqs, niov); + num_reqs = 0; } } + + if (num_reqs == 0) { + sector_num = req->sector_num; + nb_sectors = niov = 0; + start = i; + } + + nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE; + niov += req->qiov.niov; + num_reqs++; } - mrb->num_writes = 0; + submit_requests(blk, mrb, start, num_reqs, niov); + mrb->num_reqs = 0; } static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) @@ -319,7 +435,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) /* * Make sure all outstanding writes are posted to the backing device. */ - virtio_submit_multiwrite(req->dev->blk, mrb); + if (mrb->is_write && mrb->num_reqs > 0) { + virtio_blk_submit_multireq(req->dev->blk, mrb); + } blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req); } @@ -329,6 +447,9 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev, uint64_t nb_sectors = size >> BDRV_SECTOR_BITS; uint64_t total_sectors; + if (nb_sectors > INT_MAX) { + return false; + } if (sector & dev->sector_mask) { return false; } @@ -342,60 +463,6 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev, return true; } -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) -{ - BlockRequest *blkreq; - uint64_t sector; - - sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector); - - trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512); - - if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) { - virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); - virtio_blk_free_request(req); - return; - } - - block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size, - BLOCK_ACCT_WRITE); - - if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) { - virtio_submit_multiwrite(req->dev->blk, mrb); - } - - blkreq = &mrb->blkreq[mrb->num_writes]; - blkreq->sector = sector; - blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE; - blkreq->qiov = &req->qiov; - blkreq->cb = virtio_blk_rw_complete; - blkreq->opaque = req; - blkreq->error = 0; - - mrb->num_writes++; -} - -static void virtio_blk_handle_read(VirtIOBlockReq *req) -{ - uint64_t sector; - - sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector); - - trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512); - - if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) { - virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); - virtio_blk_free_request(req); - return; - } - - block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size, - BLOCK_ACCT_READ); - blk_aio_readv(req->dev->blk, sector, &req->qiov, - req->qiov.size / BDRV_SECTOR_SIZE, - virtio_blk_rw_complete, req); -} - void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) { uint32_t type; @@ -430,11 +497,57 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type); - if (type & VIRTIO_BLK_T_FLUSH) { + /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER + * is an optional flag. Altough a guest should not send this flag if + * not negotiated we ignored it in the past. So keep ignoring it. */ + switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) { + case VIRTIO_BLK_T_IN: + { + bool is_write = type & VIRTIO_BLK_T_OUT; + req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), + &req->out.sector); + + if (is_write) { + qemu_iovec_init_external(&req->qiov, iov, out_num); + trace_virtio_blk_handle_write(req, req->sector_num, + req->qiov.size / BDRV_SECTOR_SIZE); + } else { + qemu_iovec_init_external(&req->qiov, in_iov, in_num); + trace_virtio_blk_handle_read(req, req->sector_num, + req->qiov.size / BDRV_SECTOR_SIZE); + } + + if (!virtio_blk_sect_range_ok(req->dev, req->sector_num, + req->qiov.size)) { + virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + virtio_blk_free_request(req); + return; + } + + block_acct_start(blk_get_stats(req->dev->blk), + &req->acct, req->qiov.size, + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); + + /* merge would exceed maximum number of requests or IO direction + * changes */ + if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS || + is_write != mrb->is_write)) { + virtio_blk_submit_multireq(req->dev->blk, mrb); + } + + assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS); + mrb->reqs[mrb->num_reqs++] = req; + mrb->is_write = is_write; + break; + } + case VIRTIO_BLK_T_FLUSH: virtio_blk_handle_flush(req, mrb); - } else if (type & VIRTIO_BLK_T_SCSI_CMD) { + break; + case VIRTIO_BLK_T_SCSI_CMD: virtio_blk_handle_scsi(req); - } else if (type & VIRTIO_BLK_T_GET_ID) { + break; + case VIRTIO_BLK_T_GET_ID: + { VirtIOBlock *s = req->dev; /* @@ -448,14 +561,9 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) iov_from_buf(in_iov, in_num, 0, serial, size); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); virtio_blk_free_request(req); - } else if (type & VIRTIO_BLK_T_OUT) { - qemu_iovec_init_external(&req->qiov, iov, out_num); - virtio_blk_handle_write(req, mrb); - } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) { - /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */ - qemu_iovec_init_external(&req->qiov, in_iov, in_num); - virtio_blk_handle_read(req); - } else { + break; + } + default: virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); virtio_blk_free_request(req); } @@ -465,9 +573,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBlock *s = VIRTIO_BLK(vdev); VirtIOBlockReq *req; - MultiReqBuffer mrb = { - .num_writes = 0, - }; + MultiReqBuffer mrb = {}; /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start * dataplane here instead of waiting for .set_status(). @@ -481,7 +587,9 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) virtio_blk_handle_request(req, &mrb); } - virtio_submit_multiwrite(s->blk, &mrb); + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s->blk, &mrb); + } /* * FIXME: Want to check for completions before returning to guest mode, @@ -494,9 +602,7 @@ static void virtio_blk_dma_restart_bh(void *opaque) { VirtIOBlock *s = opaque; VirtIOBlockReq *req = s->rq; - MultiReqBuffer mrb = { - .num_writes = 0, - }; + MultiReqBuffer mrb = {}; qemu_bh_delete(s->bh); s->bh = NULL; @@ -509,7 +615,9 @@ static void virtio_blk_dma_restart_bh(void *opaque) req = next; } - virtio_submit_multiwrite(s->blk, &mrb); + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s->blk, &mrb); + } } static void virtio_blk_dma_restart_cb(void *opaque, int running, diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 6ef3fa57cf..2027a64093 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -134,29 +134,32 @@ typedef struct VirtIOBlock { struct VirtIOBlockDataPlane *dataplane; } VirtIOBlock; -#define VIRTIO_BLK_MAX_MERGE_REQS 32 - -typedef struct MultiReqBuffer { - BlockRequest blkreq[VIRTIO_BLK_MAX_MERGE_REQS]; - unsigned int num_writes; -} MultiReqBuffer; - typedef struct VirtIOBlockReq { + int64_t sector_num; VirtIOBlock *dev; VirtQueueElement elem; struct virtio_blk_inhdr *in; struct virtio_blk_outhdr out; QEMUIOVector qiov; struct VirtIOBlockReq *next; + struct VirtIOBlockReq *mr_next; BlockAcctCookie acct; } VirtIOBlockReq; +#define VIRTIO_BLK_MAX_MERGE_REQS 32 + +typedef struct MultiReqBuffer { + VirtIOBlockReq *reqs[VIRTIO_BLK_MAX_MERGE_REQS]; + unsigned int num_reqs; + bool is_write; +} MultiReqBuffer; + VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s); void virtio_blk_free_request(VirtIOBlockReq *req); void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb); -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb); +void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb); #endif diff --git a/trace-events b/trace-events index 907da7ed8a..57f357f7f7 100644 --- a/trace-events +++ b/trace-events @@ -116,6 +116,7 @@ virtio_blk_req_complete(void *req, int status) "req %p status %d" virtio_blk_rw_complete(void *req, int ret) "req %p ret %d" virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu" virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu" +virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t sector, size_t nsectors, bool is_write) "mrb %p start %d num_reqs %d sector %"PRIu64" nsectors %zu is_write %d" # hw/block/dataplane/virtio-blk.c virtio_blk_data_plane_start(void *s) "dataplane %p" From c99495ac1b4a27cb57bf04ed1a169177aeea2649 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 2 Feb 2015 14:52:22 +0100 Subject: [PATCH 16/47] virtio-blk: add a knob to disable request merging this adds a knob to disable request merging for debugging or benchmarks if dedired. Signed-off-by: Peter Lieven Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 5 ++++- include/hw/virtio/virtio-blk.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index d0a01a8864..8c51a295d9 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -531,7 +531,8 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) /* merge would exceed maximum number of requests or IO direction * changes */ if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS || - is_write != mrb->is_write)) { + is_write != mrb->is_write || + !req->dev->conf.request_merging)) { virtio_blk_submit_multireq(req->dev->blk, mrb); } @@ -950,6 +951,8 @@ static Property virtio_blk_properties[] = { #ifdef __linux__ DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, true), #endif + DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, + true), DEFINE_PROP_BIT("x-data-plane", VirtIOBlock, conf.data_plane, 0, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 2027a64093..fc7d311a43 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -113,6 +113,7 @@ struct VirtIOBlkConf uint32_t scsi; uint32_t config_wce; uint32_t data_plane; + uint32_t request_merging; }; struct VirtIOBlockDataPlane; From 79e7a01954d5b171186ef1e946e9159417befc1e Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 16 Jan 2015 09:38:42 +0800 Subject: [PATCH 17/47] qemu-iotests: Fix supported_oses check There is a bug in the recently added sys.platform test, and we no longer run python tests, because "linux2" is the value to compare here. So do a prefix match. According to python doc [1], the way to use sys.platform is "unless you want to test for a specific system version, it is therefore recommended to use the following idiom": if sys.platform.startswith('freebsd'): # FreeBSD-specific code here... elif sys.platform.startswith('linux'): # Linux-specific code here... [1]: https://docs.python.org/2.7/library/sys.html#sys.platform Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/iotests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 87002e0e2c..241b5ee9dd 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -288,7 +288,7 @@ def main(supported_fmts=[], supported_oses=['linux']): if supported_fmts and (imgfmt not in supported_fmts): notrun('not suitable for this image format: %s' % imgfmt) - if sys.platform not in supported_oses: + if True not in [sys.platform.startswith(x) for x in supported_oses]: notrun('not suitable for this OS: %s' % sys.platform) # We need to filter out the time taken from the output so that qemu-iotest From 6440d44cea84451ee9facb1237a4e7251631df80 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 23 Jan 2015 14:28:34 -0500 Subject: [PATCH 18/47] iotests: Specify format for qemu-nbd This patch is necessary to suppress the "probed raw" warning when running raw over nbd tests. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- tests/qemu-iotests/common.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index aa093d9d84..22d3514041 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -153,7 +153,7 @@ _make_test_img() # Start an NBD server on the image file, which is what we'll be talking to if [ $IMGPROTO = "nbd" ]; then - eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 $TEST_IMG_FILE &" + eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT $TEST_IMG_FILE &" QEMU_NBD_PID=$! sleep 1 # FIXME: qemu-nbd needs to be listening before we continue fi From e2462113b2003085ad16f15e1442ded64e2d9a29 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Mon, 12 Jan 2015 14:11:13 +0100 Subject: [PATCH 19/47] block: add event when disk usage exceeds threshold Managing applications, like oVirt (http://www.ovirt.org), make extensive use of thin-provisioned disk images. To let the guest run smoothly and be not unnecessarily paused, oVirt sets a disk usage threshold (so called 'high water mark') based on the occupation of the device, and automatically extends the image once the threshold is reached or exceeded. In order to detect the crossing of the threshold, oVirt has no choice but aggressively polling the QEMU monitor using the query-blockstats command. This lead to unnecessary system load, and is made even worse under scale: deployments with hundreds of VMs are no longer rare. To fix this, this patch adds: * A new monitor command `block-set-write-threshold', to set a mark for a given block device. * A new event `BLOCK_WRITE_THRESHOLD', to report if a block device usage exceeds the threshold. * A new `write_threshold' field into the `BlockDeviceInfo' structure, to report the configured threshold. This will allow the managing application to use smarter and more efficient monitoring, greatly reducing the need of polling. [Updated qemu-iotests 067 output to add the new 'write_threshold' property. --Stefan] [Changed g_assert_false() to !g_assert() to fix the build on older glib versions. --Kevin] Signed-off-by: Francesco Romani Reviewed-by: Eric Blake Message-id: 1421068273-692-1-git-send-email-fromani@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/Makefile.objs | 1 + block/qapi.c | 3 + block/write-threshold.c | 125 ++++++++++++++++++++++++++++++++ include/block/block_int.h | 4 + include/block/write-threshold.h | 64 ++++++++++++++++ qapi/block-core.json | 51 ++++++++++++- qmp-commands.hx | 32 ++++++++ tests/Makefile | 3 + tests/qemu-iotests/067.out | 5 ++ tests/test-write-threshold.c | 119 ++++++++++++++++++++++++++++++ 10 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 block/write-threshold.c create mode 100644 include/block/write-threshold.h create mode 100644 tests/test-write-threshold.c diff --git a/block/Makefile.objs b/block/Makefile.objs index 04b0e43eb1..010afad71a 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -20,6 +20,7 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o block-obj-y += accounting.o +block-obj-y += write-threshold.o common-obj-y += stream.o common-obj-y += commit.o diff --git a/block/qapi.c b/block/qapi.c index d1a891796c..1808e67336 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -24,6 +24,7 @@ #include "block/qapi.h" #include "block/block_int.h" +#include "block/write-threshold.h" #include "qmp-commands.h" #include "qapi-visit.h" #include "qapi/qmp-output-visitor.h" @@ -89,6 +90,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->iops_size = cfg.op_size; } + info->write_threshold = bdrv_write_threshold_get(bs); + return info; } diff --git a/block/write-threshold.c b/block/write-threshold.c new file mode 100644 index 0000000000..c2cd517716 --- /dev/null +++ b/block/write-threshold.c @@ -0,0 +1,125 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Francesco Romani + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include "block/block_int.h" +#include "block/coroutine.h" +#include "block/write-threshold.h" +#include "qemu/notify.h" +#include "qapi-event.h" +#include "qmp-commands.h" + + +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) +{ + return bs->write_threshold_offset; +} + +bool bdrv_write_threshold_is_set(const BlockDriverState *bs) +{ + return bs->write_threshold_offset > 0; +} + +static void write_threshold_disable(BlockDriverState *bs) +{ + if (bdrv_write_threshold_is_set(bs)) { + notifier_with_return_remove(&bs->write_threshold_notifier); + bs->write_threshold_offset = 0; + } +} + +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (req->offset > bs->write_threshold_offset) { + return (req->offset - bs->write_threshold_offset) + req->bytes; + } + if ((req->offset + req->bytes) > bs->write_threshold_offset) { + return (req->offset + req->bytes) - bs->write_threshold_offset; + } + } + return 0; +} + +static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + BlockDriverState *bs = req->bs; + uint64_t amount = 0; + + amount = bdrv_write_threshold_exceeded(bs, req); + if (amount > 0) { + qapi_event_send_block_write_threshold( + bs->node_name, + amount, + bs->write_threshold_offset, + &error_abort); + + /* autodisable to avoid flooding the monitor */ + write_threshold_disable(bs); + } + + return 0; /* should always let other notifiers run */ +} + +static void write_threshold_register_notifier(BlockDriverState *bs) +{ + bs->write_threshold_notifier.notify = before_write_notify; + notifier_with_return_list_add(&bs->before_write_notifiers, + &bs->write_threshold_notifier); +} + +static void write_threshold_update(BlockDriverState *bs, + int64_t threshold_bytes) +{ + bs->write_threshold_offset = threshold_bytes; +} + +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (threshold_bytes > 0) { + write_threshold_update(bs, threshold_bytes); + } else { + write_threshold_disable(bs); + } + } else { + if (threshold_bytes > 0) { + /* avoid multiple registration */ + write_threshold_register_notifier(bs); + write_threshold_update(bs, threshold_bytes); + } + /* discard bogus disable request */ + } +} + +void qmp_block_set_write_threshold(const char *node_name, + uint64_t threshold_bytes, + Error **errp) +{ + BlockDriverState *bs; + AioContext *aio_context; + + bs = bdrv_find_node(node_name); + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bdrv_write_threshold_set(bs, threshold_bytes); + + aio_context_release(aio_context); +} diff --git a/include/block/block_int.h b/include/block/block_int.h index e264be97b2..7ad19503df 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -412,6 +412,10 @@ struct BlockDriverState { /* The error object in use for blocking operations on backing_hd */ Error *backing_blocker; + + /* threshold limit for writes, in bytes. "High water mark". */ + uint64_t write_threshold_offset; + NotifierWithReturn write_threshold_notifier; }; diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h new file mode 100644 index 0000000000..f1b899cd5f --- /dev/null +++ b/include/block/write-threshold.h @@ -0,0 +1,64 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Francesco Romani + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ +#ifndef BLOCK_WRITE_THRESHOLD_H +#define BLOCK_WRITE_THRESHOLD_H + +#include + +#include "qemu/typedefs.h" +#include "qemu-common.h" + +/* + * bdrv_write_threshold_set: + * + * Set the write threshold for block devices, in bytes. + * Notify when a write exceeds the threshold, meaning the device + * is becoming full, so it can be transparently resized. + * To be used with thin-provisioned block devices. + * + * Use threshold_bytes == 0 to disable. + */ +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes); + +/* + * bdrv_write_threshold_get + * + * Get the configured write threshold, in bytes. + * Zero means no threshold configured. + */ +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs); + +/* + * bdrv_write_threshold_is_set + * + * Tell if a write threshold is set for a given BDS. + */ +bool bdrv_write_threshold_is_set(const BlockDriverState *bs); + +/* + * bdrv_write_threshold_exceeded + * + * Return the extent of a write request that exceeded the threshold, + * or zero if the request is below the threshold. + * Return zero also if the threshold was not set. + * + * NOTE: here we assume the following holds for each request this code + * deals with: + * + * assert((req->offset + req->bytes) <= UINT64_MAX) + * + * Please not there is *not* an actual C assert(). + */ +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req); + +#endif diff --git a/qapi/block-core.json b/qapi/block-core.json index b7d9772444..a3fdaf02ba 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -257,6 +257,9 @@ # # @cache: the cache mode used for the block device (since: 2.3) # +# @write_threshold: configured write threshold for the device. +# 0 if disabled. (Since 2.3) +# # Since: 0.14.0 # ## @@ -271,7 +274,8 @@ '*bps_max': 'int', '*bps_rd_max': 'int', '*bps_wr_max': 'int', '*iops_max': 'int', '*iops_rd_max': 'int', '*iops_wr_max': 'int', - '*iops_size': 'int', 'cache': 'BlockdevCacheInfo' } } + '*iops_size': 'int', 'cache': 'BlockdevCacheInfo', + 'write_threshold': 'int' } } ## # @BlockDeviceIoStatus: @@ -1917,3 +1921,48 @@ ## { 'enum': 'PreallocMode', 'data': [ 'off', 'metadata', 'falloc', 'full' ] } + +## +# @BLOCK_WRITE_THRESHOLD +# +# Emitted when writes on block device reaches or exceeds the +# configured write threshold. For thin-provisioned devices, this +# means the device should be extended to avoid pausing for +# disk exhaustion. +# The event is one shot. Once triggered, it needs to be +# re-registered with another block-set-threshold command. +# +# @node-name: graph node name on which the threshold was exceeded. +# +# @amount-exceeded: amount of data which exceeded the threshold, in bytes. +# +# @write-threshold: last configured threshold, in bytes. +# +# Since: 2.3 +## +{ 'event': 'BLOCK_WRITE_THRESHOLD', + 'data': { 'node-name': 'str', + 'amount-exceeded': 'uint64', + 'write-threshold': 'uint64' } } + +## +# @block-set-write-threshold +# +# Change the write threshold for a block drive. An event will be delivered +# if a write to this block drive crosses the configured threshold. +# This is useful to transparently resize thin-provisioned drives without +# the guest OS noticing. +# +# @node-name: graph node name on which the threshold must be set. +# +# @write-threshold: configured threshold for the block device, bytes. +# Use 0 to disable the threshold. +# +# Returns: Nothing on success +# If @node name is not found on the block device graph, +# DeviceNotFound +# +# Since: 2.3 +## +{ 'command': 'block-set-write-threshold', + 'data': { 'node-name': 'str', 'write-threshold': 'uint64' } } diff --git a/qmp-commands.hx b/qmp-commands.hx index af3fd19935..a85d8479e3 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2146,6 +2146,8 @@ Each json-object contain the following: - "iops_size": I/O size when limiting by iops (json-int) - "detect_zeroes": detect and optimize zero writing (json-string) - Possible values: "off", "on", "unmap" + - "write_threshold": write offset threshold in bytes, a event will be + emitted if crossed. Zero if disabled (json-int) - "image": the detail of the image, it is a json-object containing the following: - "filename": image file name (json-string) @@ -2223,6 +2225,7 @@ Example: "iops_wr_max": 0, "iops_size": 0, "detect_zeroes": "on", + "write_threshold": 0, "image":{ "filename":"disks/test.qcow2", "format":"qcow2", @@ -3685,6 +3688,7 @@ Example: "iops_rd_max": 0, "iops_wr_max": 0, "iops_size": 0, + "write_threshold": 0, "image":{ "filename":"disks/test.qcow2", "format":"qcow2", @@ -3920,4 +3924,32 @@ Move mouse pointer to absolute coordinates (20000, 400). { "type": "abs", "data" : { "axis": "Y", "value" : 400 } } ] } } <- { "return": {} } +EQMP + + { + .name = "block-set-write-threshold", + .args_type = "node-name:s,write-threshold:l", + .mhandler.cmd_new = qmp_marshal_input_block_set_write_threshold, + }, + +SQMP +block-set-write-threshold +------------ + +Change the write threshold for a block drive. The threshold is an offset, +thus must be non-negative. Default is no write threshold. +Setting the threshold to zero disables it. + +Arguments: + +- "node-name": the node name in the block driver state graph (json-string) +- "write-threshold": the write threshold in bytes (json-int) + +Example: + +-> { "execute": "block-set-write-threshold", + "arguments": { "node-name": "mydev", + "write-threshold": 17179869184 } } +<- { "return": {} } + EQMP diff --git a/tests/Makefile b/tests/Makefile index 5caccf765a..d5df16882d 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -68,6 +68,8 @@ check-unit-y += tests/check-qom-interface$(EXESUF) gcov-files-check-qom-interface-y = qom/object.c check-unit-y += tests/test-qemu-opts$(EXESUF) gcov-files-test-qemu-opts-y = qom/test-qemu-opts.c +check-unit-y += tests/test-write-threshold$(EXESUF) +gcov-files-test-write-threshold-y = block/write-threshold.c check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh @@ -360,6 +362,7 @@ tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y) tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y) tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a +tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(block-obj-y) libqemuutil.a libqemustub.a ifeq ($(CONFIG_POSIX),y) LIBS += -lutil diff --git a/tests/qemu-iotests/067.out b/tests/qemu-iotests/067.out index 13ff3cd7aa..00b3eaefc7 100644 --- a/tests/qemu-iotests/067.out +++ b/tests/qemu-iotests/067.out @@ -43,6 +43,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk -device virti "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -218,6 +219,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -423,6 +425,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -607,6 +610,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, @@ -717,6 +721,7 @@ Testing: "drv": "qcow2", "iops": 0, "bps_wr": 0, + "write_threshold": 0, "encrypted": false, "bps": 0, "bps_rd": 0, diff --git a/tests/test-write-threshold.c b/tests/test-write-threshold.c new file mode 100644 index 0000000000..faffa7b855 --- /dev/null +++ b/tests/test-write-threshold.c @@ -0,0 +1,119 @@ +/* + * Test block device write threshold + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include +#include +#include "block/block_int.h" +#include "block/write-threshold.h" + + +static void test_threshold_not_set_on_init(void) +{ + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + g_assert(!bdrv_write_threshold_is_set(&bs)); + + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, 0); +} + +static void test_threshold_set_get(void) +{ + uint64_t threshold = 4 * 1024 * 1024; + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + bdrv_write_threshold_set(&bs, threshold); + + g_assert(bdrv_write_threshold_is_set(&bs)); + + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, threshold); +} + +static void test_threshold_multi_set_get(void) +{ + uint64_t threshold1 = 4 * 1024 * 1024; + uint64_t threshold2 = 15 * 1024 * 1024; + uint64_t res; + BlockDriverState bs; + memset(&bs, 0, sizeof(bs)); + + bdrv_write_threshold_set(&bs, threshold1); + bdrv_write_threshold_set(&bs, threshold2); + res = bdrv_write_threshold_get(&bs); + g_assert_cmpint(res, ==, threshold2); +} + +static void test_threshold_not_trigger(void) +{ + uint64_t amount = 0; + uint64_t threshold = 4 * 1024 * 1024; + BlockDriverState bs; + BdrvTrackedRequest req; + + memset(&bs, 0, sizeof(bs)); + memset(&req, 0, sizeof(req)); + req.offset = 1024; + req.bytes = 1024; + + bdrv_write_threshold_set(&bs, threshold); + amount = bdrv_write_threshold_exceeded(&bs, &req); + g_assert_cmpuint(amount, ==, 0); +} + + +static void test_threshold_trigger(void) +{ + uint64_t amount = 0; + uint64_t threshold = 4 * 1024 * 1024; + BlockDriverState bs; + BdrvTrackedRequest req; + + memset(&bs, 0, sizeof(bs)); + memset(&req, 0, sizeof(req)); + req.offset = (4 * 1024 * 1024) - 1024; + req.bytes = 2 * 1024; + + bdrv_write_threshold_set(&bs, threshold); + amount = bdrv_write_threshold_exceeded(&bs, &req); + g_assert_cmpuint(amount, >=, 1024); +} + +typedef struct TestStruct { + const char *name; + void (*func)(void); +} TestStruct; + + +int main(int argc, char **argv) +{ + size_t i; + TestStruct tests[] = { + { "/write-threshold/not-set-on-init", + test_threshold_not_set_on_init }, + { "/write-threshold/set-get", + test_threshold_set_get }, + { "/write-threshold/multi-set-get", + test_threshold_multi_set_get }, + { "/write-threshold/not-trigger", + test_threshold_not_trigger }, + { "/write-threshold/trigger", + test_threshold_trigger }, + { NULL, NULL } + }; + + g_test_init(&argc, &argv, NULL); + for (i = 0; tests[i].name != NULL; i++) { + g_test_add_func(tests[i].name, tests[i].func); + } + return g_test_run(); +} From fa8354bd226617e2afcc45c27a74959c15291cc0 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:04 +0100 Subject: [PATCH 20/47] block/dmg: properly detect the UDIF trailer DMG files have a variable length with a UDIF trailer at the end of a file. This UDIF trailer is essential as it describes the contents of the image. At the moment however, the start of this trailer is almost always incorrect as bdrv_getlength() returns a multiple of the block size (rounded up). This results in a failure to recognize DMG files, resulting in Invalid argument (EINVAL) errors. As there is no API to retrieve the real file size, look for the magic header in the last two sectors to find the start of this 512-byte UDIF trailer (the "koly" block). The resource fork offset ("info_begin") has its offset adjusted as the initial value of offset does not mean "end of file" anymore, but "begin of UDIF trailer". [Replaced error_set(errp, ERROR_CLASS_GENERIC_ERROR, ...) with error_setg(errp, ...) as discussed with Peter. --Stefan] Signed-off-by: Peter Wu Reviewed-by: John Snow Reviewed-by: Stefan Hajnoczi Message-id: 1420566495-13284-2-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index e455886d2b..cdad28fe14 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -131,6 +131,46 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +{ + int64_t length; + int64_t offset = 0; + uint8_t buffer[515]; + int i, ret; + + /* bdrv_getlength returns a multiple of block size (512), rounded up. Since + * dmg images can have odd sizes, try to look for the "koly" magic which + * marks the begin of the UDIF trailer (512 bytes). This magic can be found + * in the last 511 bytes of the second-last sector or the first 4 bytes of + * the last sector (search space: 515 bytes) */ + length = bdrv_getlength(file_bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Failed to get file size while reading UDIF trailer"); + return length; + } else if (length < 512) { + error_setg(errp, "dmg file must be at least 512 bytes long"); + return -EINVAL; + } + if (length > 511 + 512) { + offset = length - 511 - 512; + } + length = length < 515 ? length : 515; + ret = bdrv_pread(file_bs, offset, buffer, length); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); + return ret; + } + for (i = 0; i < length - 3; i++) { + if (buffer[i] == 'k' && buffer[i+1] == 'o' && + buffer[i+2] == 'l' && buffer[i+3] == 'y') { + return offset + i; + } + } + error_setg(errp, "Could not locate UDIF trailer in dmg file"); + return -EINVAL; +} + static int dmg_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -145,15 +185,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; - /* read offset of info blocks */ - offset = bdrv_getlength(bs->file); + /* locate the UDIF trailer */ + offset = dmg_find_koly_offset(bs->file, errp); if (offset < 0) { ret = offset; goto fail; } - offset -= 0x1d8; - ret = read_uint64(bs, offset, &info_begin); + ret = read_uint64(bs, offset + 0x28, &info_begin); if (ret < 0) { goto fail; } else if (info_begin == 0) { From 65a1c7c96a64cd69269c9ba854fa347dd0bbda0b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:05 +0100 Subject: [PATCH 21/47] block/dmg: extract mish block decoding functionality Extract the mish block decoder such that this can be used for other formats in the future. A new DmgHeaderState struct is introduced to share state while decoding. The code is kept unchanged as much as possible, a "fail" label is added for example where a simple return would probably do. In dmg_open, the variable "tmp" is renamed to "rsrc_data_offset" for clarity and comments have been added explaining various data. Note that this patch has one subtle difference with the previous version which should not affect functionality. In the previous code, the end of a resource was inferred from the mish block (the offsets would be increased by the fields). In this patch, the resource length is used instead to avoid the need to rely on the previous offsets. Signed-off-by: Peter Wu Reviewed-by: John Snow Reviewed-by: Stefan Hajnoczi Message-id: 1420566495-13284-3-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 228 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 133 insertions(+), 95 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index cdad28fe14..c571ac9121 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -171,19 +171,138 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) return -EINVAL; } +/* used when building the sector table */ +typedef struct DmgHeaderState { + /* used internally by dmg_read_mish_block to remember offsets of blocks + * across calls */ + uint64_t last_in_offset; + uint64_t last_out_offset; + /* exported for dmg_open */ + uint32_t max_compressed_size; + uint32_t max_sectors_per_chunk; +} DmgHeaderState; + +static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds, + int64_t offset, uint32_t count) +{ + BDRVDMGState *s = bs->opaque; + uint32_t type, i; + int ret; + size_t new_size; + uint32_t chunk_count; + + ret = read_uint32(bs, offset, &type); + if (ret < 0) { + goto fail; + } + + /* skip data that is not a valid MISH block (invalid magic or too small) */ + if (type != 0x6d697368 || count < 244) { + /* assume success for now */ + return 0; + } + + offset += 4; + offset += 200; + + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); + + for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { + ret = read_uint32(bs, offset, &s->types[i]); + if (ret < 0) { + goto fail; + } + offset += 4; + if (s->types[i] != 0x80000005 && s->types[i] != 1 && + s->types[i] != 2) { + if (s->types[i] == 0xffffffff && i > 0) { + ds->last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; + ds->last_out_offset = s->sectors[i - 1] + + s->sectorcounts[i - 1]; + } + chunk_count--; + i--; + offset += 36; + continue; + } + offset += 4; + + ret = read_uint64(bs, offset, &s->sectors[i]); + if (ret < 0) { + goto fail; + } + s->sectors[i] += ds->last_out_offset; + offset += 8; + + ret = read_uint64(bs, offset, &s->sectorcounts[i]); + if (ret < 0) { + goto fail; + } + offset += 8; + + if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + + ret = read_uint64(bs, offset, &s->offsets[i]); + if (ret < 0) { + goto fail; + } + s->offsets[i] += ds->last_in_offset; + offset += 8; + + ret = read_uint64(bs, offset, &s->lengths[i]); + if (ret < 0) { + goto fail; + } + offset += 8; + + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &ds->max_compressed_size, + &ds->max_sectors_per_chunk); + } + s->n_chunks += chunk_count; + return 0; + +fail: + return ret; +} + static int dmg_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVDMGState *s = bs->opaque; - uint64_t info_begin, info_end, last_in_offset, last_out_offset; - uint32_t count, tmp; - uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i; + DmgHeaderState ds; + uint64_t info_begin, info_end; + uint32_t count, rsrc_data_offset; int64_t offset; int ret; bs->read_only = 1; s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; + /* used by dmg_read_mish_block to keep track of the current I/O position */ + ds.last_in_offset = 0; + ds.last_out_offset = 0; + ds.max_compressed_size = 1; + ds.max_sectors_per_chunk = 1; /* locate the UDIF trailer */ offset = dmg_find_koly_offset(bs->file, errp); @@ -200,10 +319,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = read_uint32(bs, info_begin, &tmp); + ret = read_uint32(bs, info_begin, &rsrc_data_offset); if (ret < 0) { goto fail; - } else if (tmp != 0x100) { + } else if (rsrc_data_offset != 0x100) { ret = -EINVAL; goto fail; } @@ -215,15 +334,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + /* end of resource data, ignoring the following resource map */ info_end = info_begin + count; + /* begin of resource data (consisting of one or more resources) */ offset = info_begin + 0x100; - /* read offsets */ - last_in_offset = last_out_offset = 0; + /* read offsets (mish blocks) from one or more resources in resource data */ while (offset < info_end) { - uint32_t type; - + /* size of following resource */ ret = read_uint32(bs, offset, &count); if (ret < 0) { goto fail; @@ -233,100 +352,19 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, } offset += 4; - ret = read_uint32(bs, offset, &type); + ret = dmg_read_mish_block(bs, &ds, offset, count); if (ret < 0) { goto fail; } - - if (type == 0x6d697368 && count >= 244) { - size_t new_size; - uint32_t chunk_count; - - offset += 4; - offset += 200; - - chunk_count = (count - 204) / 40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size / 2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); - - for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { - ret = read_uint32(bs, offset, &s->types[i]); - if (ret < 0) { - goto fail; - } - offset += 4; - if (s->types[i] != 0x80000005 && s->types[i] != 1 && - s->types[i] != 2) { - if (s->types[i] == 0xffffffff && i > 0) { - last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; - last_out_offset = s->sectors[i - 1] + - s->sectorcounts[i - 1]; - } - chunk_count--; - i--; - offset += 36; - continue; - } - offset += 4; - - ret = read_uint64(bs, offset, &s->sectors[i]); - if (ret < 0) { - goto fail; - } - s->sectors[i] += last_out_offset; - offset += 8; - - ret = read_uint64(bs, offset, &s->sectorcounts[i]); - if (ret < 0) { - goto fail; - } - offset += 8; - - if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { - error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); - ret = -EINVAL; - goto fail; - } - - ret = read_uint64(bs, offset, &s->offsets[i]); - if (ret < 0) { - goto fail; - } - s->offsets[i] += last_in_offset; - offset += 8; - - ret = read_uint64(bs, offset, &s->lengths[i]); - if (ret < 0) { - goto fail; - } - offset += 8; - - if (s->lengths[i] > DMG_LENGTHS_MAX) { - error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); - ret = -EINVAL; - goto fail; - } - - update_max_chunk_size(s, i, &max_compressed_size, - &max_sectors_per_chunk); - } - s->n_chunks += chunk_count; - } + /* advance offset by size of resource */ + offset += count; } /* initialize zlib engine */ s->compressed_chunk = qemu_try_blockalign(bs->file, - max_compressed_size + 1); + ds.max_compressed_size + 1); s->uncompressed_chunk = qemu_try_blockalign(bs->file, - 512 * max_sectors_per_chunk); + 512 * ds.max_sectors_per_chunk); if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { ret = -ENOMEM; goto fail; From b0e8dc5d54225d2e7012bd306e62ac90ba6fc1ea Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:06 +0100 Subject: [PATCH 22/47] block/dmg: extract processing of resource forks Besides the offset, also read the resource length. This length is now used in the extracted function to verify the end of the resource fork against "count" from the resource fork. Instead of relying on the value of offset to conclude whether the resource fork is available or not (info_begin==0), check the rsrc_fork_length instead. This would allow a dmg file to begin with a resource fork. This seemingly unnecessary restriction was found while trying to craft a DMG file by hand. Other changes: - Do not require resource data offset to be 0x100 (but check that it is within bounds though). - Further improve boundary checking (resource data must be within the resource fork). - Use correct value for resource data length (spotted by John Snow) - Consider the resource data offset when determining info_end. This fixes an EINVAL on the tuxpaint dmg example. The resource fork format is documented at https://developer.apple.com/legacy/library/documentation/mac/pdf/MoreMacintoshToolbox.pdf#page=151 Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-4-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 108 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index c571ac9121..04bae729a4 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -285,13 +285,70 @@ fail: return ret; } +static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + int ret; + uint32_t count, rsrc_data_offset; + uint64_t info_end; + uint64_t offset; + + /* read offset from begin of resource fork (info_begin) to resource data */ + ret = read_uint32(bs, info_begin, &rsrc_data_offset); + if (ret < 0) { + goto fail; + } else if (rsrc_data_offset > info_length) { + ret = -EINVAL; + goto fail; + } + + /* read length of resource data */ + ret = read_uint32(bs, info_begin + 8, &count); + if (ret < 0) { + goto fail; + } else if (count == 0 || rsrc_data_offset + count > info_length) { + ret = -EINVAL; + goto fail; + } + + /* begin of resource data (consisting of one or more resources) */ + offset = info_begin + rsrc_data_offset; + + /* end of resource data (there is possibly a following resource map + * which will be ignored). */ + info_end = offset + count; + + /* read offsets (mish blocks) from one or more resources in resource data */ + while (offset < info_end) { + /* size of following resource */ + ret = read_uint32(bs, offset, &count); + if (ret < 0) { + goto fail; + } else if (count == 0) { + ret = -EINVAL; + goto fail; + } + offset += 4; + + ret = dmg_read_mish_block(bs, ds, offset, count); + if (ret < 0) { + goto fail; + } + /* advance offset by size of resource */ + offset += count; + } + return 0; + +fail: + return ret; +} + static int dmg_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVDMGState *s = bs->opaque; DmgHeaderState ds; - uint64_t info_begin, info_end; - uint32_t count, rsrc_data_offset; + uint64_t rsrc_fork_offset, rsrc_fork_length; int64_t offset; int ret; @@ -311,53 +368,24 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = read_uint64(bs, offset + 0x28, &info_begin); + /* offset of resource fork (RsrcForkOffset) */ + ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); if (ret < 0) { goto fail; - } else if (info_begin == 0) { - ret = -EINVAL; - goto fail; } - - ret = read_uint32(bs, info_begin, &rsrc_data_offset); + ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length); if (ret < 0) { goto fail; - } else if (rsrc_data_offset != 0x100) { - ret = -EINVAL; - goto fail; } - - ret = read_uint32(bs, info_begin + 4, &count); - if (ret < 0) { - goto fail; - } else if (count == 0) { - ret = -EINVAL; - goto fail; - } - /* end of resource data, ignoring the following resource map */ - info_end = info_begin + count; - - /* begin of resource data (consisting of one or more resources) */ - offset = info_begin + 0x100; - - /* read offsets (mish blocks) from one or more resources in resource data */ - while (offset < info_end) { - /* size of following resource */ - ret = read_uint32(bs, offset, &count); - if (ret < 0) { - goto fail; - } else if (count == 0) { - ret = -EINVAL; - goto fail; - } - offset += 4; - - ret = dmg_read_mish_block(bs, &ds, offset, count); + if (rsrc_fork_length != 0) { + ret = dmg_read_resource_fork(bs, &ds, + rsrc_fork_offset, rsrc_fork_length); if (ret < 0) { goto fail; } - /* advance offset by size of resource */ - offset += count; + } else { + ret = -EINVAL; + goto fail; } /* initialize zlib engine */ From 7aee37b93a4f694cdd670807f30b8efd33d0c721 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:07 +0100 Subject: [PATCH 23/47] block/dmg: process a buffer instead of reading ints As the decoded plist XML is not a pointer in the file, dmg_read_mish_block must be able to process a buffer instead of a file pointer. Since the full buffer must be processed, let's change the return value again to just a success flag. Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-5-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 04bae729a4..4f56227fc5 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -100,6 +100,16 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } +static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset) +{ + return be64_to_cpu(*(uint64_t *)&buffer[offset]); +} + +static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset) +{ + return be32_to_cpu(*(uint32_t *)&buffer[offset]); +} + /* Increase max chunk sizes, if necessary. This function is used to calculate * the buffer sizes needed for compressed/uncompressed chunk I/O. */ @@ -182,20 +192,16 @@ typedef struct DmgHeaderState { uint32_t max_sectors_per_chunk; } DmgHeaderState; -static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds, - int64_t offset, uint32_t count) +static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, + uint8_t *buffer, uint32_t count) { - BDRVDMGState *s = bs->opaque; uint32_t type, i; int ret; size_t new_size; uint32_t chunk_count; + int64_t offset = 0; - ret = read_uint32(bs, offset, &type); - if (ret < 0) { - goto fail; - } - + type = buff_read_uint32(buffer, offset); /* skip data that is not a valid MISH block (invalid magic or too small) */ if (type != 0x6d697368 || count < 244) { /* assume success for now */ @@ -214,10 +220,7 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds, s->sectorcounts = g_realloc(s->sectorcounts, new_size); for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { - ret = read_uint32(bs, offset, &s->types[i]); - if (ret < 0) { - goto fail; - } + s->types[i] = buff_read_uint32(buffer, offset); offset += 4; if (s->types[i] != 0x80000005 && s->types[i] != 1 && s->types[i] != 2) { @@ -233,17 +236,11 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds, } offset += 4; - ret = read_uint64(bs, offset, &s->sectors[i]); - if (ret < 0) { - goto fail; - } + s->sectors[i] = buff_read_uint64(buffer, offset); s->sectors[i] += ds->last_out_offset; offset += 8; - ret = read_uint64(bs, offset, &s->sectorcounts[i]); - if (ret < 0) { - goto fail; - } + s->sectorcounts[i] = buff_read_uint64(buffer, offset); offset += 8; if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { @@ -254,17 +251,11 @@ static int dmg_read_mish_block(BlockDriverState *bs, DmgHeaderState *ds, goto fail; } - ret = read_uint64(bs, offset, &s->offsets[i]); - if (ret < 0) { - goto fail; - } + s->offsets[i] = buff_read_uint64(buffer, offset); s->offsets[i] += ds->last_in_offset; offset += 8; - ret = read_uint64(bs, offset, &s->lengths[i]); - if (ret < 0) { - goto fail; - } + s->lengths[i] = buff_read_uint64(buffer, offset); offset += 8; if (s->lengths[i] > DMG_LENGTHS_MAX) { @@ -288,8 +279,10 @@ fail: static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, uint64_t info_begin, uint64_t info_length) { + BDRVDMGState *s = bs->opaque; int ret; uint32_t count, rsrc_data_offset; + uint8_t *buffer = NULL; uint64_t info_end; uint64_t offset; @@ -330,16 +323,23 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, } offset += 4; - ret = dmg_read_mish_block(bs, ds, offset, count); + buffer = g_realloc(buffer, count); + ret = bdrv_pread(bs->file, offset, buffer, count); + if (ret < 0) { + goto fail; + } + + ret = dmg_read_mish_block(s, ds, buffer, count); if (ret < 0) { goto fail; } /* advance offset by size of resource */ offset += count; } - return 0; + ret = 0; fail: + g_free(buffer); return ret; } From f6e6652d7c9251236fc1ecc6cece36104c7af15b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:08 +0100 Subject: [PATCH 24/47] block/dmg: validate chunk size to avoid overflow Previously the chunk size was not checked, allowing for a large memory allocation. This patch checks whether the chunks size is within the resource fork length, and whether the resource fork is below the trailer of the dmg file. Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-6-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/dmg.c b/block/dmg.c index 4f56227fc5..5c2c2c231d 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -317,7 +317,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, ret = read_uint32(bs, offset, &count); if (ret < 0) { goto fail; - } else if (count == 0) { + } else if (count == 0 || count > info_end - offset) { ret = -EINVAL; goto fail; } @@ -377,6 +377,11 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, if (ret < 0) { goto fail; } + if (rsrc_fork_offset >= offset || + rsrc_fork_length > offset - rsrc_fork_offset) { + ret = -EINVAL; + goto fail; + } if (rsrc_fork_length != 0) { ret = dmg_read_resource_fork(bs, &ds, rsrc_fork_offset, rsrc_fork_length); From 0599e56ed468d245148530eb194be4a5056a0583 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:09 +0100 Subject: [PATCH 25/47] block/dmg: process XML plists The format is simple enough to avoid using a full-blown XML parser. It assumes that all BLKX items begin with the "mish" magic word, therefore it is not a problem if other values get matched which are not a BLKX block. The offsets are based on the description at http://newosxbook.com/DMG.html For compatibility with glib 2.12, use g_base64_decode (which additionally requires an extra buffer allocation) instead of g_base64_decode_inplace (which is only available since glib 2.20). Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-7-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index 5c2c2c231d..a78506ad77 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -26,6 +26,7 @@ #include "qemu/bswap.h" #include "qemu/module.h" #include +#include enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used @@ -343,12 +344,67 @@ fail: return ret; } +static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + BDRVDMGState *s = bs->opaque; + int ret; + uint8_t *buffer = NULL; + char *data_begin, *data_end; + + /* Have at least some length to avoid NULL for g_malloc. Attempt to set a + * safe upper cap on the data length. A test sample had a XML length of + * about 1 MiB. */ + if (info_length == 0 || info_length > 16 * 1024 * 1024) { + ret = -EINVAL; + goto fail; + } + + buffer = g_malloc(info_length + 1); + buffer[info_length] = '\0'; + ret = bdrv_pread(bs->file, info_begin, buffer, info_length); + if (ret != info_length) { + ret = -EINVAL; + goto fail; + } + + /* look for .... The data is 284 (0x11c) bytes after base64 + * decode. The actual data element has 431 (0x1af) bytes which includes tabs + * and line feeds. */ + data_end = (char *)buffer; + while ((data_begin = strstr(data_end, "")) != NULL) { + guchar *mish; + gsize out_len = 0; + + data_begin += 6; + data_end = strstr(data_begin, ""); + /* malformed XML? */ + if (data_end == NULL) { + ret = -EINVAL; + goto fail; + } + *data_end++ = '\0'; + mish = g_base64_decode(data_begin, &out_len); + ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len); + g_free(mish); + if (ret < 0) { + goto fail; + } + } + ret = 0; + +fail: + g_free(buffer); + return ret; +} + static int dmg_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVDMGState *s = bs->opaque; DmgHeaderState ds; uint64_t rsrc_fork_offset, rsrc_fork_length; + uint64_t plist_xml_offset, plist_xml_length; int64_t offset; int ret; @@ -382,12 +438,31 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + /* offset of property list (XMLOffset) */ + ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset); + if (ret < 0) { + goto fail; + } + ret = read_uint64(bs, offset + 0xe0, &plist_xml_length); + if (ret < 0) { + goto fail; + } + if (plist_xml_offset >= offset || + plist_xml_length > offset - plist_xml_offset) { + ret = -EINVAL; + goto fail; + } if (rsrc_fork_length != 0) { ret = dmg_read_resource_fork(bs, &ds, rsrc_fork_offset, rsrc_fork_length); if (ret < 0) { goto fail; } + } else if (plist_xml_length != 0) { + ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length); + if (ret < 0) { + goto fail; + } } else { ret = -EINVAL; goto fail; From 8daf425794ed624083de98caab04b9fedb873420 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:10 +0100 Subject: [PATCH 26/47] block/dmg: set virtual size to a non-zero value Right now the virtual size is always reported as zero which makes it impossible to convert between formats. After this patch, the number of sectors will be read from the trailer ("koly" block). To verify the behavior, the output of `dmg2img foo.dmg foo.img` was compared against `qemu-img convert -f dmg -O raw foo.dmg foo.raw`. The tests showed that the file contents are exactly the same, except that QEMU creates a slightly larger file (it matches the total sectors count). Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-8-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index a78506ad77..a33c131a9f 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -452,6 +452,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors); + if (ret < 0) { + goto fail; + } + if (bs->total_sectors < 0) { + ret = -EINVAL; + goto fail; + } if (rsrc_fork_length != 0) { ret = dmg_read_resource_fork(bs, &ds, rsrc_fork_offset, rsrc_fork_length); From c6d34865fa02463cf34634f45369ebcc725b101b Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:11 +0100 Subject: [PATCH 27/47] block/dmg: fix sector data offset calculation This patch addresses two issues: - The data fork offset was not taken into account, resulting in failure to read an InstallESD.dmg file (5164763151 bytes) which had a non-zero DataForkOffset field. - The offset of the previous block ("partition") was unconditionally added to the current block because older files would start the input offset of a new block at zero. Newer files (including vlc-2.1.5.dmg, tuxpaint-0.9.15-macosx.dmg and OS X Yosemite [MAS].dmg) failed in reads because these files have chunk offsets, relative to the begin of a data fork. Now the data offset of the mish is taken into account. While we could check that the data_offset is within the data fork, let's not do that here as it would only result in parse failures on invalid files (rather than gracefully handling such bad files). dmg_read will error out if the offset is incorrect. Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-9-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index a33c131a9f..34dc6322e9 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -186,7 +186,7 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) typedef struct DmgHeaderState { /* used internally by dmg_read_mish_block to remember offsets of blocks * across calls */ - uint64_t last_in_offset; + uint64_t data_fork_offset; uint64_t last_out_offset; /* exported for dmg_open */ uint32_t max_compressed_size; @@ -201,6 +201,8 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, size_t new_size; uint32_t chunk_count; int64_t offset = 0; + uint64_t data_offset; + uint64_t in_offset = ds->data_fork_offset; type = buff_read_uint32(buffer, offset); /* skip data that is not a valid MISH block (invalid magic or too small) */ @@ -209,8 +211,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, return 0; } - offset += 4; - offset += 200; + /* location in data fork for (compressed) blob (in bytes) */ + data_offset = buff_read_uint64(buffer, offset + 0x18); + in_offset += data_offset; + + /* move to begin of chunk entries */ + offset += 204; chunk_count = (count - 204) / 40; new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); @@ -226,7 +232,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, if (s->types[i] != 0x80000005 && s->types[i] != 1 && s->types[i] != 2) { if (s->types[i] == 0xffffffff && i > 0) { - ds->last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; ds->last_out_offset = s->sectors[i - 1] + s->sectorcounts[i - 1]; } @@ -253,7 +258,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, } s->offsets[i] = buff_read_uint64(buffer, offset); - s->offsets[i] += ds->last_in_offset; + s->offsets[i] += in_offset; offset += 8; s->lengths[i] = buff_read_uint64(buffer, offset); @@ -412,7 +417,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* used by dmg_read_mish_block to keep track of the current I/O position */ - ds.last_in_offset = 0; + ds.data_fork_offset = 0; ds.last_out_offset = 0; ds.max_compressed_size = 1; ds.max_sectors_per_chunk = 1; @@ -424,6 +429,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + /* offset of data fork (DataForkOffset) */ + ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset); + if (ret < 0) { + goto fail; + } else if (ds.data_fork_offset > offset) { + ret = -EINVAL; + goto fail; + } + /* offset of resource fork (RsrcForkOffset) */ ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); if (ret < 0) { From 66ec3bba972b6d4433698397d25d039c230949dc Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:12 +0100 Subject: [PATCH 28/47] block/dmg: use SectorNumber from BLKX header Previously the sector table parsing relied on the previous offset of the DMG file. Now it uses the sector number from the BLKX header (see http://newosxbook.com/DMG.html). The implementation of dmg2img (from vu1tur) does not base the output sector on the location of the terminator (0xffffffff) either so it should be safe to drop this dependency on the previous state. (It makes somehow makes sense, a terminator should halt further processing of a block and is perhaps used to preallocate some space.) Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-10-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 34dc6322e9..d144a5e810 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -187,7 +187,6 @@ typedef struct DmgHeaderState { /* used internally by dmg_read_mish_block to remember offsets of blocks * across calls */ uint64_t data_fork_offset; - uint64_t last_out_offset; /* exported for dmg_open */ uint32_t max_compressed_size; uint32_t max_sectors_per_chunk; @@ -203,6 +202,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, int64_t offset = 0; uint64_t data_offset; uint64_t in_offset = ds->data_fork_offset; + uint64_t out_offset; type = buff_read_uint32(buffer, offset); /* skip data that is not a valid MISH block (invalid magic or too small) */ @@ -211,6 +211,9 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, return 0; } + /* chunk offsets are relative to this sector number */ + out_offset = buff_read_uint64(buffer, offset + 8); + /* location in data fork for (compressed) blob (in bytes) */ data_offset = buff_read_uint64(buffer, offset + 0x18); in_offset += data_offset; @@ -231,10 +234,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, offset += 4; if (s->types[i] != 0x80000005 && s->types[i] != 1 && s->types[i] != 2) { - if (s->types[i] == 0xffffffff && i > 0) { - ds->last_out_offset = s->sectors[i - 1] + - s->sectorcounts[i - 1]; - } chunk_count--; i--; offset += 36; @@ -243,7 +242,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, offset += 4; s->sectors[i] = buff_read_uint64(buffer, offset); - s->sectors[i] += ds->last_out_offset; + s->sectors[i] += out_offset; offset += 8; s->sectorcounts[i] = buff_read_uint64(buffer, offset); @@ -418,7 +417,6 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* used by dmg_read_mish_block to keep track of the current I/O position */ ds.data_fork_offset = 0; - ds.last_out_offset = 0; ds.max_compressed_size = 1; ds.max_sectors_per_chunk = 1; From a8b10c6ead7f62e8eadbdaf944f371889c3c4c29 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:13 +0100 Subject: [PATCH 29/47] block/dmg: factor out block type check In preparation for adding bzip2 support, split the type check into a separate function. Make all offsets relative to the begin of a chunk such that it is easier to recognize the position without having to add up all offsets. Some comments are added to describe the fields. There is no functional change. Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-11-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index d144a5e810..cc584b55aa 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -192,6 +192,18 @@ typedef struct DmgHeaderState { uint32_t max_sectors_per_chunk; } DmgHeaderState; +static bool dmg_is_known_block_type(uint32_t entry_type) +{ + switch (entry_type) { + case 0x00000001: /* uncompressed */ + case 0x00000002: /* zeroes */ + case 0x80000005: /* zlib */ + return true; + default: + return false; + } +} + static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, uint8_t *buffer, uint32_t count) { @@ -231,22 +243,19 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { s->types[i] = buff_read_uint32(buffer, offset); - offset += 4; - if (s->types[i] != 0x80000005 && s->types[i] != 1 && - s->types[i] != 2) { + if (!dmg_is_known_block_type(s->types[i])) { chunk_count--; i--; - offset += 36; + offset += 40; continue; } - offset += 4; - s->sectors[i] = buff_read_uint64(buffer, offset); + /* sector number */ + s->sectors[i] = buff_read_uint64(buffer, offset + 8); s->sectors[i] += out_offset; - offset += 8; - s->sectorcounts[i] = buff_read_uint64(buffer, offset); - offset += 8; + /* sector count */ + s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 @@ -256,12 +265,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, goto fail; } - s->offsets[i] = buff_read_uint64(buffer, offset); + /* offset in (compressed) data fork */ + s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); s->offsets[i] += in_offset; - offset += 8; - s->lengths[i] = buff_read_uint64(buffer, offset); - offset += 8; + /* length in (compressed) data fork */ + s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); if (s->lengths[i] > DMG_LENGTHS_MAX) { error_report("length %" PRIu64 " for chunk %" PRIu32 @@ -273,6 +282,7 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, update_max_chunk_size(s, i, &ds->max_compressed_size, &ds->max_sectors_per_chunk); + offset += 40; } s->n_chunks += chunk_count; return 0; From 6b383c08c46468ecee98ecf71ffd8362e9bd7f42 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:14 +0100 Subject: [PATCH 30/47] block/dmg: support bzip2 block entry types This patch adds support for bzip2-compressed block entries as introduced with OS X 10.4 (source: https://en.wikipedia.org/wiki/Apple_Disk_Image). It was tested against a 5.2G "OS X Yosemite" installation image which stores the BLXX block in the XML property list (instead of resource forks) and has over 5k chunks. New configure entries are added (--enable-bzip2 / --disable-bzip2) to control inclusion of bzip2 functionality (which requires linking against libbz2). The help message suggests that this option is needed for DMG files, but the tests are generic enough that other parts of QEMU can use bzip2 if needed. The identifiers are based on http://newosxbook.com/DMG.html. The decompression routines are based on the zlib case, but as there is no way to reset the decompression state (unlike zlib), memory is allocated and deallocated for every decompression. This should not be problematic as the decompression takes most of the time and as blocks are typically about/over 1 MiB in size, only one allocation is done every 2000 sectors. Signed-off-by: Peter Wu Reviewed-by: John Snow Acked-by: Paolo Bonzini Message-id: 1420566495-13284-12-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/Makefile.objs | 1 + block/dmg.c | 43 ++++++++++++++++++++++++++++++++++++++++++- configure | 31 +++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 1 deletion(-) diff --git a/block/Makefile.objs b/block/Makefile.objs index 010afad71a..db2933e469 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -37,5 +37,6 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/dmg.c b/block/dmg.c index cc584b55aa..797fda3ec8 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -26,6 +26,9 @@ #include "qemu/bswap.h" #include "qemu/module.h" #include +#ifdef CONFIG_BZIP2 +#include +#endif #include enum { @@ -56,6 +59,9 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +#ifdef CONFIG_BZIP2 + bz_stream bzstream; +#endif } BDRVDMGState; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -123,6 +129,7 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, switch (s->types[chunk]) { case 0x80000005: /* zlib compressed */ + case 0x80000006: /* bzip2 compressed */ compressed_size = s->lengths[chunk]; uncompressed_sectors = s->sectorcounts[chunk]; break; @@ -198,6 +205,9 @@ static bool dmg_is_known_block_type(uint32_t entry_type) case 0x00000001: /* uncompressed */ case 0x00000002: /* zeroes */ case 0x80000005: /* zlib */ +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 */ +#endif return true; default: return false; @@ -564,13 +574,16 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { int ret; uint32_t chunk = search_chunk(s, sector_num); +#ifdef CONFIG_BZIP2 + uint64_t total_out; +#endif if (chunk >= s->n_chunks) { return -1; } s->current_chunk = s->n_chunks; - switch (s->types[chunk]) { + switch (s->types[chunk]) { /* block entry type */ case 0x80000005: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ @@ -594,6 +607,34 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return -1; } break; } +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0); + if (ret != BZ_OK) { + return -1; + } + s->bzstream.next_in = (char *)s->compressed_chunk; + s->bzstream.avail_in = (unsigned int) s->lengths[chunk]; + s->bzstream.next_out = (char *)s->uncompressed_chunk; + s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk]; + ret = BZ2_bzDecompress(&s->bzstream); + total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) + + s->bzstream.total_out_lo32; + BZ2_bzDecompressEnd(&s->bzstream); + if (ret != BZ_STREAM_END || + total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; +#endif /* CONFIG_BZIP2 */ case 1: /* copy */ ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); diff --git a/configure b/configure index e00e03af32..7ba4bcb866 100755 --- a/configure +++ b/configure @@ -313,6 +313,7 @@ glx="" zlib="yes" lzo="" snappy="" +bzip2="" guest_agent="" guest_agent_with_vss="no" vss_win32_sdk="" @@ -1060,6 +1061,10 @@ for opt do ;; --enable-snappy) snappy="yes" ;; + --disable-bzip2) bzip2="no" + ;; + --enable-bzip2) bzip2="yes" + ;; --enable-guest-agent) guest_agent="yes" ;; --disable-guest-agent) guest_agent="no" @@ -1374,6 +1379,8 @@ Advanced options (experts only): --enable-usb-redir enable usb network redirection support --enable-lzo enable the support of lzo compression library --enable-snappy enable the support of snappy compression library + --enable-bzip2 enable the support of bzip2 compression library (for + reading bzip2-compressed dmg images) --disable-guest-agent disable building of the QEMU Guest Agent --enable-guest-agent enable building of the QEMU Guest Agent --with-vss-sdk=SDK-path enable Windows VSS support in QEMU Guest Agent @@ -1819,6 +1826,24 @@ EOF fi fi +########################################## +# bzip2 check + +if test "$bzip2" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { BZ2_bzlibVersion(); return 0; } +EOF + if compile_prog "" "-lbz2" ; then + bzip2="yes" + else + if test "$bzip2" = "yes"; then + feature_not_found "libbzip2" "Install libbzip2 devel" + fi + bzip2="no" + fi +fi + ########################################## # libseccomp check @@ -4385,6 +4410,7 @@ echo "vhdx $vhdx" echo "Quorum $quorum" echo "lzo support $lzo" echo "snappy support $snappy" +echo "bzip2 support $bzip2" echo "NUMA host support $numa" if test "$sdl_too_old" = "yes"; then @@ -4743,6 +4769,11 @@ if test "$snappy" = "yes" ; then echo "CONFIG_SNAPPY=y" >> $config_host_mak fi +if test "$bzip2" = "yes" ; then + echo "CONFIG_BZIP2=y" >> $config_host_mak + echo "BZIP2_LIBS=-lbz2" >> $config_host_mak +fi + if test "$libiscsi" = "yes" ; then echo "CONFIG_LIBISCSI=m" >> $config_host_mak echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak From 177b75104da3e3a9af84975c32a44782d903c41f Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 6 Jan 2015 18:48:15 +0100 Subject: [PATCH 31/47] block/dmg: improve zeroes handling Disk images may contain large all-zeroes gaps (1.66k sectors or 812 MiB is seen in the real world). These blocks (type 2) do not need to be extracted into a temporary buffer, there is no need to allocate memory for these blocks nor to check its length. (For the test image, the maximum uncompressed size is 1054371 bytes, probably for a bzip2-compressed block.) Signed-off-by: Peter Wu Reviewed-by: John Snow Message-id: 1420566495-13284-13-git-send-email-peter@lekensteyn.nl Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/dmg.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 797fda3ec8..825c49d59a 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -137,7 +137,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, uncompressed_sectors = (s->lengths[chunk] + 511) / 512; break; case 2: /* zero */ - uncompressed_sectors = s->sectorcounts[chunk]; + /* as the all-zeroes block may be large, it is treated specially: the + * sector is not copied from a large buffer, a simple memset is used + * instead. Therefore uncompressed_sectors does not need to be set. */ break; } @@ -267,7 +269,9 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* sector count */ s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); - if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + /* all-zeroes sector (type 2) does not need to be "uncompressed" and can + * therefore be unbounded. */ + if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); @@ -643,7 +647,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) } break; case 2: /* zero */ - memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]); + /* see dmg_read, it is treated specially. No buffer needs to be + * pre-filled, the zeroes can be set directly. */ break; } s->current_chunk = chunk; @@ -662,6 +667,13 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, if (dmg_read_chunk(bs, sector_num + i) != 0) { return -1; } + /* Special case: current chunk is all zeroes. Do not perform a memcpy as + * s->uncompressed_chunk may be too small to cover the large all-zeroes + * section. dmg_read_chunk is called to find s->current_chunk */ + if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ + memset(buf + i * 512, 0, 512); + continue; + } sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; memcpy(buf + i * 512, s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); From 0adfa1ed655904d5bf17fe047635a563f0229789 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 12 Jan 2015 12:31:32 +0000 Subject: [PATCH 32/47] qed: check for header size overflow Header size is denoted in clusters. The maximum cluster size is 64 MB but there is no limit on header size. Check for uint32_t overflow in case the header size field has a whacky value. Signed-off-by: Stefan Hajnoczi Message-id: 1421065893-18875-2-git-send-email-stefanha@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/qed.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/qed.c b/block/qed.c index 80f18d82e2..892b13c806 100644 --- a/block/qed.c +++ b/block/qed.c @@ -440,6 +440,11 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->l2_mask = s->table_nelems - 1; s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + /* Header size calculation must not overflow uint32_t */ + if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { + return -EINVAL; + } + if ((s->header.features & QED_F_BACKING_FILE)) { if ((uint64_t)s->header.backing_filename_offset + s->header.backing_filename_size > From 319fc53e344d5cead970c74f088ae5c607d426b3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 12 Jan 2015 12:31:33 +0000 Subject: [PATCH 33/47] qemu-iotests: add 116 invalid QED input file tests These tests exercise error code paths in the QED image format. The tests are very simple, they just prove that the error path exits cleanly. Signed-off-by: Stefan Hajnoczi Message-id: 1421065893-18875-3-git-send-email-stefanha@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- tests/qemu-iotests/116 | 96 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/116.out | 37 +++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 134 insertions(+) create mode 100755 tests/qemu-iotests/116 create mode 100644 tests/qemu-iotests/116.out diff --git a/tests/qemu-iotests/116 b/tests/qemu-iotests/116 new file mode 100755 index 0000000000..713ed484ba --- /dev/null +++ b/tests/qemu-iotests/116 @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Test error code paths for invalid QED images +# +# The aim of this test is to exercise the error paths in qed_open() to ensure +# there are no crashes with invalid input files. +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=stefanha@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qed +_supported_proto generic +_supported_os Linux + + +size=128M + +echo +echo "== truncated header cluster ==" +_make_test_img $size +truncate -s 512 "$TEST_IMG" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid header magic ==" +_make_test_img $size +poke_file "$TEST_IMG" "0" "QEDX" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid cluster size ==" +_make_test_img $size +poke_file "$TEST_IMG" "4" "\xff\xff\xff\xff" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid table size ==" +_make_test_img $size +poke_file "$TEST_IMG" "8" "\xff\xff\xff\xff" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid header size ==" +_make_test_img $size +poke_file "$TEST_IMG" "12" "\xff\xff\xff\xff" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid L1 table offset ==" +_make_test_img $size +poke_file "$TEST_IMG" "40" "\xff\xff\xff\xff\xff\xff\xff\xff" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo "== invalid image size ==" +_make_test_img $size +poke_file "$TEST_IMG" "48" "\xff\xff\xff\xff\xff\xff\xff\xff" +$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/116.out b/tests/qemu-iotests/116.out new file mode 100644 index 0000000000..b679ceea63 --- /dev/null +++ b/tests/qemu-iotests/116.out @@ -0,0 +1,37 @@ +QA output created by 116 + +== truncated header cluster == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' + +== invalid header magic == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Image not in QED format +no file open, try 'help open' + +== invalid cluster size == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' + +== invalid table size == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' + +== invalid header size == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' + +== invalid L1 table offset == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' + +== invalid image size == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 +qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument +no file open, try 'help open' +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index f8bf354156..4b2b93bc19 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -116,3 +116,4 @@ 111 rw auto quick 113 rw auto quick 114 rw auto quick +116 rw auto quick From e729fa6afed3aa917287b63034244f548b79ec60 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Tue, 27 Jan 2015 08:33:55 -0500 Subject: [PATCH 34/47] block: fix off-by-one error in qcow and qcow2 This fixes an off-by-one error introduced in 9a29e18. Both qcow and qcow2 need to make sure to leave room for string terminator '\0' for the backing file, so the max length of the non-terminated string is either 1023 or PATH_MAX - 1. Reported-by: Kevin Wolf Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/qcow.c | 2 +- block/qcow2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index ccbe9e0d2c..055896910e 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -215,7 +215,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023 || len > sizeof(bs->backing_file)) { + if (len > 1023 || len >= sizeof(bs->backing_file)) { error_setg(errp, "Backing file name too long"); ret = -EINVAL; goto fail; diff --git a/block/qcow2.c b/block/qcow2.c index dbaf016bc7..7e614d76a4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -869,7 +869,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (header.backing_file_offset != 0) { len = header.backing_file_size; if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || - len > sizeof(bs->backing_file)) { + len >= sizeof(bs->backing_file)) { error_setg(errp, "Backing file name too long"); ret = -EINVAL; goto fail; From 53f9e77f4ed898a4ea4ad2bc118c06c336ba30dd Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 26 Jan 2015 21:02:56 -0500 Subject: [PATCH 35/47] iotests: Fix 083 As of 8f9e835fd2e687d2bfe936819c3494af4343614d, probing should be disabled in the qemu-iotests (at least when using qemu-io). This broke 083's reference output (which consisted mostly of "Could not read image for determining its format"). This patch fixes it. Note that one case which failed before is now successful: Disconnect after data. This is due to qemu having read twice before (once for probing, once for the qemu-io read command), but only once now (the qemu-io read command). Therefore, reading is successful (which is correct). Signed-off-by: Max Reitz Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- tests/qemu-iotests/083.out | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out index 85ee8d6dd7..fd397f421f 100644 --- a/tests/qemu-iotests/083.out +++ b/tests/qemu-iotests/083.out @@ -62,8 +62,7 @@ no file open, try 'help open' === Check disconnect after neg2 === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect 8 neg2 === @@ -80,49 +79,43 @@ no file open, try 'help open' === Check disconnect before request === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect after request === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect before reply === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect after reply === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect 4 reply === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect 8 reply === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect before data === -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error === Check disconnect after data === -read failed: Input/output error +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) === Check disconnect before neg-classic === @@ -157,7 +150,6 @@ no file open, try 'help open' === Check disconnect after neg-classic === -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not read image for determining its format: Input/output error -no file open, try 'help open' +read failed: Input/output error *** done From 23ab6953f4675b8663888ce6cfe40f4191be3e48 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 26 Jan 2015 21:02:57 -0500 Subject: [PATCH 36/47] iotests: Fix 100 for nbd In case of NBD, _make_test_img starts a new NBD server. Therefore, _cleanup_test_img (which shuts that server down) has to be invoked before the next _make_test_img call in order to make 100 work for NBD. Signed-off-by: Max Reitz Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- tests/qemu-iotests/100 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/qemu-iotests/100 b/tests/qemu-iotests/100 index 9124aba760..7c1b235b51 100755 --- a/tests/qemu-iotests/100 +++ b/tests/qemu-iotests/100 @@ -55,6 +55,8 @@ echo "== verify pattern ==" $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Sequential requests ==" _make_test_img $size @@ -66,6 +68,8 @@ $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0xce 4k 4k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 8k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Superset overlapping requests ==" _make_test_img $size @@ -79,6 +83,8 @@ $QEMU_IO -c "read -P 0xcd 0 1k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0xcd 3k 1k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Subset overlapping requests ==" _make_test_img $size @@ -92,6 +98,8 @@ $QEMU_IO -c "read -P 0xce 0 1k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0xce 3k 1k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Head overlapping requests ==" _make_test_img $size @@ -104,6 +112,8 @@ echo "== verify pattern ==" $QEMU_IO -c "read -P 0xce 2k 2k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Tail overlapping requests ==" _make_test_img $size @@ -116,6 +126,8 @@ echo "== verify pattern ==" $QEMU_IO -c "read -P 0xce 0k 2k" "$TEST_IMG" | _filter_qemu_io $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io +_cleanup_test_img + echo echo "== Disjoint requests ==" _make_test_img $size From a231cb272611c758d45135f6a35a4dd1beaf5585 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 26 Jan 2015 21:02:58 -0500 Subject: [PATCH 37/47] iotests: Fix 104 for NBD _make_test_img sets up an NBD server, _cleanup_test_img shuts it down; thus, _cleanup_test_img has to be called before _make_test_img is invoked another time. Furthermore, the pipe through _filter_test_img was unnecessary; _make_test_img already takes care of that. And finally, a filter is added to _filter_img_info to replace "nbd://127.0.0.1:10810" by "TEST_DIR/t.IMGFMT", since the former is the way to express the full image path (normally the latter) for NBD tests. Signed-off-by: Max Reitz Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- tests/qemu-iotests/104 | 9 +++------ tests/qemu-iotests/common.filter | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/104 b/tests/qemu-iotests/104 index b471aa5bb1..f32752bb6f 100755 --- a/tests/qemu-iotests/104 +++ b/tests/qemu-iotests/104 @@ -28,11 +28,7 @@ here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! -_cleanup() -{ - _cleanup_test_img -} -trap "_cleanup; exit \$status" 0 1 2 3 15 +trap "exit \$status" 0 1 2 3 15 # get standard environment, filters and checks . ./common.rc @@ -47,8 +43,9 @@ echo image_sizes="1024 1234" for s in $image_sizes; do - _make_test_img $s | _filter_img_create + _make_test_img $s _img_info | _filter_img_info + _cleanup_test_img done # success, all done diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter index b73c70be95..06e1bb045f 100644 --- a/tests/qemu-iotests/common.filter +++ b/tests/qemu-iotests/common.filter @@ -200,6 +200,7 @@ _filter_img_info() sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \ -e "s#$TEST_DIR#TEST_DIR#g" \ -e "s#$IMGFMT#IMGFMT#g" \ + -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \ -e "/encrypted: yes/d" \ -e "/cluster_size: [0-9]\\+/d" \ -e "/table_size: [0-9]\\+/d" \ From 1ce52846d3ce0e4b58caebcae84719bef6401fbb Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 26 Jan 2015 21:02:59 -0500 Subject: [PATCH 38/47] nbd: Improve error messages This patch makes use of the Error object for nbd_receive_negotiate() so that errors during negotiation look nicer. Furthermore, this patch adds an additional error message if the received magic was wrong, but would be correct for the other protocol version, respectively: So if an export name was specified, but the NBD server magic corresponds to an old handshake, this condition is explicitly signaled to the user, and vice versa. As these messages are now part of the "Could not open image" error message, additional filtering has to be employed in iotest 083, which this patch does as well. Signed-off-by: Max Reitz Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/nbd-client.c | 4 +-- block/nbd-client.h | 2 +- block/nbd.c | 2 +- include/block/nbd.h | 2 +- nbd.c | 42 ++++++++++++++++++------------- qemu-nbd.c | 7 +++++- tests/qemu-iotests/083 | 3 ++- tests/qemu-iotests/083.out | 51 +++++++++++++------------------------- 8 files changed, 55 insertions(+), 58 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 6e1c97cad0..28bfb62bed 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -373,7 +373,7 @@ void nbd_client_session_close(NbdClientSession *client) } int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, - int sock, const char *export) + int sock, const char *export, Error **errp) { int ret; @@ -382,7 +382,7 @@ int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, qemu_set_block(sock); ret = nbd_receive_negotiate(sock, export, &client->nbdflags, &client->size, - &client->blocksize); + &client->blocksize, errp); if (ret < 0) { logout("Failed to negotiate with the NBD server\n"); closesocket(sock); diff --git a/block/nbd-client.h b/block/nbd-client.h index cd478f3a98..cfeecc2775 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -36,7 +36,7 @@ typedef struct NbdClientSession { } NbdClientSession; int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, - int sock, const char *export_name); + int sock, const char *export_name, Error **errp); void nbd_client_session_close(NbdClientSession *client); int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, diff --git a/block/nbd.c b/block/nbd.c index 04cc845076..2e20831715 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -271,7 +271,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, } /* NBD handshake */ - result = nbd_client_session_init(&s->client, bs, sock, export); + result = nbd_client_session_init(&s->client, bs, sock, export, errp); g_free(export); return result; } diff --git a/include/block/nbd.h b/include/block/nbd.h index 348302c90b..b75959556c 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -75,7 +75,7 @@ enum { ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read); int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, - off_t *size, size_t *blocksize); + off_t *size, size_t *blocksize, Error **errp); int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize); ssize_t nbd_send_request(int csock, struct nbd_request *request); ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply); diff --git a/nbd.c b/nbd.c index 53cf82be0b..e56afbc162 100644 --- a/nbd.c +++ b/nbd.c @@ -494,7 +494,7 @@ fail: } int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, - off_t *size, size_t *blocksize) + off_t *size, size_t *blocksize, Error **errp) { char buf[256]; uint64_t magic, s; @@ -506,13 +506,13 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, rc = -EINVAL; if (read_sync(csock, buf, 8) != 8) { - LOG("read failed"); + error_setg(errp, "Failed to read data"); goto fail; } buf[8] = '\0'; if (strlen(buf) == 0) { - LOG("server connection closed"); + error_setg(errp, "Server connection closed unexpectedly"); goto fail; } @@ -527,12 +527,12 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, qemu_isprint(buf[7]) ? buf[7] : '.'); if (memcmp(buf, "NBDMAGIC", 8) != 0) { - LOG("Invalid magic received"); + error_setg(errp, "Invalid magic received"); goto fail; } if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) { - LOG("read failed"); + error_setg(errp, "Failed to read magic"); goto fail; } magic = be64_to_cpu(magic); @@ -545,52 +545,60 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, TRACE("Checking magic (opts_magic)"); if (magic != NBD_OPTS_MAGIC) { - LOG("Bad magic received"); + if (magic == NBD_CLIENT_MAGIC) { + error_setg(errp, "Server does not support export names"); + } else { + error_setg(errp, "Bad magic received"); + } goto fail; } if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) { - LOG("flags read failed"); + error_setg(errp, "Failed to read server flags"); goto fail; } *flags = be16_to_cpu(tmp) << 16; /* reserved for future use */ if (write_sync(csock, &reserved, sizeof(reserved)) != sizeof(reserved)) { - LOG("write failed (reserved)"); + error_setg(errp, "Failed to read reserved field"); goto fail; } /* write the export name */ magic = cpu_to_be64(magic); if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) { - LOG("write failed (magic)"); + error_setg(errp, "Failed to send export name magic"); goto fail; } opt = cpu_to_be32(NBD_OPT_EXPORT_NAME); if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) { - LOG("write failed (opt)"); + error_setg(errp, "Failed to send export name option number"); goto fail; } namesize = cpu_to_be32(strlen(name)); if (write_sync(csock, &namesize, sizeof(namesize)) != sizeof(namesize)) { - LOG("write failed (namesize)"); + error_setg(errp, "Failed to send export name length"); goto fail; } if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) { - LOG("write failed (name)"); + error_setg(errp, "Failed to send export name"); goto fail; } } else { TRACE("Checking magic (cli_magic)"); if (magic != NBD_CLIENT_MAGIC) { - LOG("Bad magic received"); + if (magic == NBD_OPTS_MAGIC) { + error_setg(errp, "Server requires an export name"); + } else { + error_setg(errp, "Bad magic received"); + } goto fail; } } if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) { - LOG("read failed"); + error_setg(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); @@ -599,19 +607,19 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags, if (!name) { if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) { - LOG("read failed (flags)"); + error_setg(errp, "Failed to read export flags"); goto fail; } *flags = be32_to_cpup(flags); } else { if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) { - LOG("read failed (tmp)"); + error_setg(errp, "Failed to read export flags"); goto fail; } *flags |= be32_to_cpu(tmp); } if (read_sync(csock, &buf, 124) != 124) { - LOG("read failed (buf)"); + error_setg(errp, "Failed to read reserved block"); goto fail; } rc = 0; diff --git a/qemu-nbd.c b/qemu-nbd.c index d222512412..4d8df08385 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -284,6 +284,7 @@ static void *nbd_client_thread(void *arg) int fd, sock; int ret; pthread_t show_parts_thread; + Error *local_error = NULL; sock = unix_socket_outgoing(sockpath); if (sock < 0) { @@ -291,8 +292,12 @@ static void *nbd_client_thread(void *arg) } ret = nbd_receive_negotiate(sock, NULL, &nbdflags, - &size, &blocksize); + &size, &blocksize, &local_error); if (ret < 0) { + if (local_error) { + fprintf(stderr, "%s\n", error_get_pretty(local_error)); + error_free(local_error); + } goto out_socket; } diff --git a/tests/qemu-iotests/083 b/tests/qemu-iotests/083 index 991a9d91db..1b2d3f1156 100755 --- a/tests/qemu-iotests/083 +++ b/tests/qemu-iotests/083 @@ -56,7 +56,8 @@ filter_nbd() { # # Filter out the TCP port number since this changes between runs. sed -e 's#^.*nbd\.c:.*##g' \ - -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g' + -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g' \ + -e 's#\(exportname=foo\|PORT\): Failed to .*$#\1#' } check_disconnect() { diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out index fd397f421f..8c1441bf4f 100644 --- a/tests/qemu-iotests/083.out +++ b/tests/qemu-iotests/083.out @@ -1,62 +1,52 @@ QA output created by 083 === Check disconnect before neg1 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect after neg1 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 8 neg1 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 16 neg1 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect before export === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect after export === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 4 export === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 12 export === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 16 export === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect before neg2 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect after neg2 === @@ -66,14 +56,12 @@ read failed: Input/output error === Check disconnect 8 neg2 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect 10 neg2 === - -qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo no file open, try 'help open' === Check disconnect before request === @@ -119,32 +107,27 @@ read 512/512 bytes at offset 0 === Check disconnect before neg-classic === - -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT no file open, try 'help open' === Check disconnect 8 neg-classic === - -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT no file open, try 'help open' === Check disconnect 16 neg-classic === - -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT no file open, try 'help open' === Check disconnect 24 neg-classic === - -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT no file open, try 'help open' === Check disconnect 28 neg-classic === - -qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument +qemu-io: can't open device nbd:127.0.0.1:PORT no file open, try 'help open' === Check disconnect after neg-classic === From 75af1f34cd5b07c3c7fcf86dfc99a42de48a600d Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 6 Feb 2015 11:54:11 +0100 Subject: [PATCH 39/47] block: introduce BDRV_REQUEST_MAX_SECTORS we check and adjust request sizes at several places with sometimes inconsistent checks or default values: INT_MAX INT_MAX >> BDRV_SECTOR_BITS UINT_MAX >> BDRV_SECTOR_BITS SIZE_MAX >> BDRV_SECTOR_BITS This patches introdocues a macro for the maximal allowed sectors per request and uses it at several places. Signed-off-by: Peter Lieven Reviewed-by: Denis V. Lunev Signed-off-by: Kevin Wolf --- block.c | 21 +++++++++------------ hw/block/virtio-blk.c | 4 ++-- include/block/block.h | 3 +++ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block.c b/block.c index 8272ef901e..49e0073ce9 100644 --- a/block.c +++ b/block.c @@ -2647,7 +2647,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, { int64_t len; - if (size > INT_MAX) { + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { return -EIO; } @@ -2671,7 +2671,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { return -EIO; } @@ -2758,7 +2758,7 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, .iov_len = nb_sectors * BDRV_SECTOR_SIZE, }; - if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { return -EINVAL; } @@ -2826,13 +2826,10 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } for (;;) { - nb_sectors = target_sectors - sector_num; + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); if (nb_sectors <= 0) { return 0; } - if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { - nb_sectors = INT_MAX / BDRV_SECTOR_SIZE; - } ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); if (ret < 0) { error_report("error getting block status at sector %" PRId64 ": %s", @@ -3167,7 +3164,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { - if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { return -EINVAL; } @@ -3202,8 +3199,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, struct iovec iov = {0}; int ret = 0; - int max_write_zeroes = bs->bl.max_write_zeroes ? - bs->bl.max_write_zeroes : INT_MAX; + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); while (nb_sectors > 0 && !ret) { int num = nb_sectors; @@ -3458,7 +3455,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { - if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { return -EINVAL; } @@ -5120,7 +5117,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } - max_discard = bs->bl.max_discard ? bs->bl.max_discard : INT_MAX; + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); while (nb_sectors > 0) { int ret; int num = nb_sectors; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 8c51a295d9..1a8a176dcc 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -381,7 +381,7 @@ void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb) } max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk); - max_xfer_len = MIN_NON_ZERO(max_xfer_len, INT_MAX); + max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS); qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs), &multireq_compare); @@ -447,7 +447,7 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev, uint64_t nb_sectors = size >> BDRV_SECTOR_BITS; uint64_t total_sectors; - if (nb_sectors > INT_MAX) { + if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) { return false; } if (sector & dev->sector_mask) { diff --git a/include/block/block.h b/include/block/block.h index 3082d2b80e..25a6d62d1b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -83,6 +83,9 @@ typedef enum { #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) #define BDRV_SECTOR_MASK ~(BDRV_SECTOR_SIZE - 1) +#define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \ + INT_MAX >> BDRV_SECTOR_BITS) + /* * Allocation status flags * BDRV_BLOCK_DATA: data is read from bs->file or another file From fa21e6faa6f1d7de49fd030ebdb0722b59cf9a41 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 6 Feb 2015 14:24:43 +0300 Subject: [PATCH 40/47] nbd: fix max_discard/max_transfer_length nbd_co_discard calls nbd_client_session_co_discard which uses uint32_t as the length in bytes of the data to discard due to the following definition: struct nbd_request { uint32_t magic; uint32_t type; uint64_t handle; uint64_t from; uint32_t len; <-- the length of data to be discarded, in bytes } QEMU_PACKED; Thus we should limit bl_max_discard to UINT32_MAX >> BDRV_SECTOR_BITS to avoid overflow. NBD read/write code uses the same structure for transfers. Fix max_transfer_length accordingly. Signed-off-by: Denis V. Lunev CC: Peter Lieven CC: Kevin Wolf Signed-off-by: Kevin Wolf --- block/nbd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/nbd.c b/block/nbd.c index 2e20831715..b05d1d0407 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -301,6 +301,12 @@ static int nbd_co_flush(BlockDriverState *bs) return nbd_client_session_co_flush(&s->client); } +static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; + bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; +} + static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { @@ -396,6 +402,7 @@ static BlockDriver bdrv_nbd = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, @@ -413,6 +420,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, @@ -430,6 +438,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, .bdrv_attach_aio_context = nbd_attach_aio_context, From 8e8cb375e0964b4ed099cb8563029028db26a834 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 5 Feb 2015 14:55:31 +0200 Subject: [PATCH 41/47] block: Give always priority to unused entries in the qcow2 L2 cache The current algorithm to replace entries from the L2 cache gives priority to newer hits by dividing the hit count of all existing entries by two everytime there is a cache miss. However, if there are several cache misses the hit count of the existing entries can easily go down to 0. This will result in those entries being replaced even when there are others that have never been used. This problem is more noticeable with larger disk images and cache sizes, since the chances of having several misses before the cache is full are higher. If we make sure that the hit count can never go down to 0 again, unused entries will always have priority. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/qcow2-cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 904f6b1f44..b1155492a3 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -253,7 +253,9 @@ static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) /* Give newer hits priority */ /* TODO Check how to optimize the replacement strategy */ - c->entries[i].cache_hits /= 2; + if (c->entries[i].cache_hits > 1) { + c->entries[i].cache_hits /= 2; + } } if (min_index == -1) { From 8c44dfbc62a50a8bc4113f199b8662861f757591 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 6 Feb 2015 09:39:16 -0500 Subject: [PATCH 42/47] qcow2: Rewrite qcow2_alloc_bytes() qcow2_alloc_bytes() is a function with insufficient error handling and an unnecessary goto. This patch rewrites it. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 80 +++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9afdb40b40..9b80ca79ea 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -759,54 +759,54 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) { BDRVQcowState *s = bs->opaque; - int64_t offset, cluster_offset; - int free_in_cluster; + int64_t offset; + size_t free_in_cluster; + int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); assert(size > 0 && size <= s->cluster_size); - if (s->free_byte_offset == 0) { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; + assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); + + offset = s->free_byte_offset; + + if (offset) { + int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + return refcount; } - s->free_byte_offset = offset; - } - redo: - free_in_cluster = s->cluster_size - - offset_into_cluster(s, s->free_byte_offset); - if (size <= free_in_cluster) { - /* enough space in current cluster */ - offset = s->free_byte_offset; - s->free_byte_offset += size; - free_in_cluster -= size; - if (free_in_cluster == 0) - s->free_byte_offset = 0; - if (offset_into_cluster(s, offset) != 0) - qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - } else { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; - } - cluster_offset = start_of_cluster(s, s->free_byte_offset); - if ((cluster_offset + s->cluster_size) == offset) { - /* we are lucky: contiguous data */ - offset = s->free_byte_offset; - qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - s->free_byte_offset += size; - } else { - s->free_byte_offset = offset; - goto redo; + + if (refcount == 0xffff) { + offset = 0; } } - /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must - * be flushed before the caller's L2 table updates. - */ + free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); + if (!offset || free_in_cluster < size) { + int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + if (new_cluster < 0) { + return new_cluster; + } + + if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { + offset = new_cluster; + } + } + + assert(offset); + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + /* The cluster refcount was incremented; refcount blocks must be flushed + * before the caller's L2 table updates. */ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + + s->free_byte_offset = offset + size; + if (!offset_into_cluster(s, s->free_byte_offset)) { + s->free_byte_offset = 0; + } + return offset; } From 24d6bffe8ab523b1dcd8b8a2be678775f3e26b1c Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 29 Jan 2015 10:36:58 +0100 Subject: [PATCH 43/47] blockdev: Give find_block_job() an Error ** parameter When find_block_job() fails, all its callers build the same Error object. Build it in find_block_job() instead. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-id: 1422524221-8566-2-git-send-email-armbru@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- blockdev.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/blockdev.c b/blockdev.c index d59efd3f15..8d6ca35ac8 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2653,7 +2653,8 @@ out: } /* Get the block job for a given device name and acquire its AioContext */ -static BlockJob *find_block_job(const char *device, AioContext **aio_context) +static BlockJob *find_block_job(const char *device, AioContext **aio_context, + Error **errp) { BlockDriverState *bs; @@ -2673,6 +2674,7 @@ static BlockJob *find_block_job(const char *device, AioContext **aio_context) return bs->job; notfound: + error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); *aio_context = NULL; return NULL; } @@ -2680,10 +2682,9 @@ notfound: void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp) { AioContext *aio_context; - BlockJob *job = find_block_job(device, &aio_context); + BlockJob *job = find_block_job(device, &aio_context, errp); if (!job) { - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } @@ -2695,10 +2696,9 @@ void qmp_block_job_cancel(const char *device, bool has_force, bool force, Error **errp) { AioContext *aio_context; - BlockJob *job = find_block_job(device, &aio_context); + BlockJob *job = find_block_job(device, &aio_context, errp); if (!job) { - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } @@ -2721,10 +2721,9 @@ out: void qmp_block_job_pause(const char *device, Error **errp) { AioContext *aio_context; - BlockJob *job = find_block_job(device, &aio_context); + BlockJob *job = find_block_job(device, &aio_context, errp); if (!job) { - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } @@ -2736,10 +2735,9 @@ void qmp_block_job_pause(const char *device, Error **errp) void qmp_block_job_resume(const char *device, Error **errp) { AioContext *aio_context; - BlockJob *job = find_block_job(device, &aio_context); + BlockJob *job = find_block_job(device, &aio_context, errp); if (!job) { - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } @@ -2751,10 +2749,9 @@ void qmp_block_job_resume(const char *device, Error **errp) void qmp_block_job_complete(const char *device, Error **errp) { AioContext *aio_context; - BlockJob *job = find_block_job(device, &aio_context); + BlockJob *job = find_block_job(device, &aio_context, errp); if (!job) { - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } From 2e3a0266bd84a9be9f5e23c1568db6eb7f3e9e94 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 29 Jan 2015 10:36:59 +0100 Subject: [PATCH 44/47] blockdev: Eliminate silly QERR_BLOCK_JOB_NOT_ACTIVE macro The QERR_ macros are leftovers from the days of "rich" error objects. They're used with error_set() and qerror_report(), and expand into the first *two* arguments. This trickiness has become pointless. Clean this one up. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-id: 1422524221-8566-3-git-send-email-armbru@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- blockdev.c | 3 ++- include/qapi/qmp/qerror.h | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/blockdev.c b/blockdev.c index 8d6ca35ac8..287d7af901 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2674,7 +2674,8 @@ static BlockJob *find_block_job(const char *device, AioContext **aio_context, return bs->job; notfound: - error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); + error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE, + "No active block job on device '%s'", device); *aio_context = NULL; return NULL; } diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index eeaf0cb981..85f1699df7 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -37,9 +37,6 @@ void qerror_report_err(Error *err); #define QERR_BASE_NOT_FOUND \ ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found" -#define QERR_BLOCK_JOB_NOT_ACTIVE \ - ERROR_CLASS_DEVICE_NOT_ACTIVE, "No active block job on device '%s'" - #define QERR_BLOCK_JOB_NOT_READY \ ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed" From 4d2855a348c5e90f56584ab9777fc877965ca2e0 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 29 Jan 2015 10:37:00 +0100 Subject: [PATCH 45/47] block: New bdrv_add_key(), convert monitor to use it Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-id: 1422524221-8566-4-git-send-email-armbru@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block.c | 29 +++++++++++++++++++++++++++++ blockdev.c | 24 ++---------------------- include/block/block.h | 1 + monitor.c | 16 +++++++++++----- qmp.c | 8 ++++---- 5 files changed, 47 insertions(+), 31 deletions(-) diff --git a/block.c b/block.c index 49e0073ce9..84af3cd210 100644 --- a/block.c +++ b/block.c @@ -3713,6 +3713,35 @@ int bdrv_set_key(BlockDriverState *bs, const char *key) return ret; } +/* + * Provide an encryption key for @bs. + * If @key is non-null: + * If @bs is not encrypted, fail. + * Else if the key is invalid, fail. + * Else set @bs's key to @key, replacing the existing key, if any. + * If @key is null: + * If @bs is encrypted and still lacks a key, fail. + * Else do nothing. + * On failure, store an error object through @errp if non-null. + */ +void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) +{ + if (key) { + if (!bdrv_is_encrypted(bs)) { + error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, + bdrv_get_device_name(bs)); + } else if (bdrv_set_key(bs, key) < 0) { + error_set(errp, QERR_INVALID_PASSWORD); + } + } else { + if (bdrv_key_required(bs)) { + error_set(errp, QERR_DEVICE_ENCRYPTED, + bdrv_get_device_name(bs), + bdrv_get_encrypted_filename(bs)); + } + } +} + const char *bdrv_get_format_name(BlockDriverState *bs) { return bs->drv ? bs->drv->format_name : NULL; diff --git a/blockdev.c b/blockdev.c index 287d7af901..7d34960b96 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1793,7 +1793,6 @@ void qmp_block_passwd(bool has_device, const char *device, Error *local_err = NULL; BlockDriverState *bs; AioContext *aio_context; - int err; bs = bdrv_lookup_bs(has_device ? device : NULL, has_node_name ? node_name : NULL, @@ -1806,16 +1805,8 @@ void qmp_block_passwd(bool has_device, const char *device, aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - err = bdrv_set_key(bs, password); - if (err == -EINVAL) { - error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs)); - goto out; - } else if (err < 0) { - error_set(errp, QERR_INVALID_PASSWORD); - goto out; - } + bdrv_add_key(bs, password, errp); -out: aio_context_release(aio_context); } @@ -1833,18 +1824,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename, return; } - if (bdrv_key_required(bs)) { - if (password) { - if (bdrv_set_key(bs, password) < 0) { - error_set(errp, QERR_INVALID_PASSWORD); - } - } else { - error_set(errp, QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs), - bdrv_get_encrypted_filename(bs)); - } - } else if (password) { - error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs)); - } + bdrv_add_key(bs, password, errp); } void qmp_change_blockdev(const char *device, const char *filename, diff --git a/include/block/block.h b/include/block/block.h index 25a6d62d1b..321295e5f7 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -381,6 +381,7 @@ BlockDriverState *bdrv_next(BlockDriverState *bs); int bdrv_is_encrypted(BlockDriverState *bs); int bdrv_key_required(BlockDriverState *bs); int bdrv_set_key(BlockDriverState *bs, const char *key); +void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp); int bdrv_query_missing_keys(void); void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque); diff --git a/monitor.c b/monitor.c index 5a24311844..c3cc060b45 100644 --- a/monitor.c +++ b/monitor.c @@ -5368,9 +5368,12 @@ static void bdrv_password_cb(void *opaque, const char *password, Monitor *mon = opaque; BlockDriverState *bs = readline_opaque; int ret = 0; + Error *local_err = NULL; - if (bdrv_set_key(bs, password) != 0) { - monitor_printf(mon, "invalid password\n"); + bdrv_add_key(bs, password, &local_err); + if (local_err) { + monitor_printf(mon, "%s\n", error_get_pretty(local_err)); + error_free(local_err); ret = -EPERM; } if (mon->password_completion_cb) @@ -5388,17 +5391,20 @@ int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs, BlockCompletionFunc *completion_cb, void *opaque) { + Error *local_err = NULL; int err; - if (!bdrv_key_required(bs)) { + bdrv_add_key(bs, NULL, &local_err); + if (!local_err) { if (completion_cb) completion_cb(opaque, 0); return 0; } + /* Need a key for @bs */ + if (monitor_ctrl_mode(mon)) { - qerror_report(QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs), - bdrv_get_encrypted_filename(bs)); + qerror_report_err(local_err); return -1; } diff --git a/qmp.c b/qmp.c index 7f2d25a492..20a9e9739f 100644 --- a/qmp.c +++ b/qmp.c @@ -154,6 +154,7 @@ SpiceInfo *qmp_query_spice(Error **errp) void qmp_cont(Error **errp) { + Error *local_err = NULL; BlockDriverState *bs; if (runstate_needs_reset()) { @@ -167,10 +168,9 @@ void qmp_cont(Error **errp) bdrv_iostatus_reset(bs); } for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { - if (bdrv_key_required(bs)) { - error_set(errp, QERR_DEVICE_ENCRYPTED, - bdrv_get_device_name(bs), - bdrv_get_encrypted_filename(bs)); + bdrv_add_key(bs, NULL, &local_err); + if (local_err) { + error_propagate(errp, local_err); return; } } From b1ca639184d93984551b423d8e538ad4add5eb15 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 29 Jan 2015 10:37:01 +0100 Subject: [PATCH 46/47] block: Eliminate silly QERR_ macros used for encryption keys The QERR_ macros are leftovers from the days of "rich" error objects. They're used with error_set() and qerror_report(), and expand into the first *two* arguments. This trickiness has become pointless. Clean up QERR_DEVICE_ENCRYPTED and QERR_DEVICE_NOT_ENCRYPTED. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-id: 1422524221-8566-5-git-send-email-armbru@redhat.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block.c | 5 +++-- include/qapi/qmp/qerror.h | 6 ------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 84af3cd210..210fd5f997 100644 --- a/block.c +++ b/block.c @@ -3728,14 +3728,15 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { if (key) { if (!bdrv_is_encrypted(bs)) { - error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, + error_setg(errp, "Device '%s' is not encrypted", bdrv_get_device_name(bs)); } else if (bdrv_set_key(bs, key) < 0) { error_set(errp, QERR_INVALID_PASSWORD); } } else { if (bdrv_key_required(bs)) { - error_set(errp, QERR_DEVICE_ENCRYPTED, + error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED, + "'%s' (%s) is encrypted", bdrv_get_device_name(bs), bdrv_get_encrypted_filename(bs)); } diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 85f1699df7..986260f466 100644 --- a/include/qapi/qmp/qerror.h +++ b/include/qapi/qmp/qerror.h @@ -49,9 +49,6 @@ void qerror_report_err(Error *err); #define QERR_BUS_NOT_FOUND \ ERROR_CLASS_GENERIC_ERROR, "Bus '%s' not found" -#define QERR_DEVICE_ENCRYPTED \ - ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted" - #define QERR_DEVICE_HAS_NO_MEDIUM \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' has no medium" @@ -67,9 +64,6 @@ void qerror_report_err(Error *err); #define QERR_DEVICE_NO_HOTPLUG \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging" -#define QERR_DEVICE_NOT_ENCRYPTED \ - ERROR_CLASS_GENERIC_ERROR, "Device '%s' is not encrypted" - #define QERR_DEVICE_NOT_FOUND \ ERROR_CLASS_DEVICE_NOT_FOUND, "Device '%s' not found" From 728dacbda817b2ca259e9d337fab06bcf14e94a6 Mon Sep 17 00:00:00 2001 From: Programmingkid Date: Mon, 19 Jan 2015 17:12:55 -0500 Subject: [PATCH 47/47] block/raw-posix.c: Fix raw_getlength() on Mac OS X block devices This patch replaces the dummy code in raw_getlength() for block devices on OS X, which always returned LLONG_MAX, with a real implementation that returns the actual block device size. Signed-off-by: John Arbuckle Reviewed-by: Stefan Hajnoczi Tested-by: Peter Maydell Signed-off-by: Kevin Wolf --- block/raw-posix.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 7b42f37d83..e474c17974 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1375,7 +1375,20 @@ again: if (size == 0) #endif #if defined(__APPLE__) && defined(__MACH__) - size = LLONG_MAX; + { + uint64_t sectors = 0; + uint32_t sector_size = 0; + + if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 + && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { + size = sectors * sector_size; + } else { + size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } + } + } #else size = lseek(fd, 0LL, SEEK_END); if (size < 0) {