From 8a4266144ebffceb8fda2a5feb03d23c535923d4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 16 Jul 2010 17:17:01 +0200 Subject: [PATCH 1/6] block: Change bdrv_commit to handle multiple sectors at once bdrv_commit copies the image to its backing file sector by sector, which is (surprise!) relatively slow. Let's take a larger buffer and handle more sectors at once if possible. With a 1G qcow2 file, this brought the time bdrv_commit takes down from 5:06 min to 1:14 min for me. Signed-off-by: Kevin Wolf --- block.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/block.c b/block.c index 452ae94eea..c4d6814726 100644 --- a/block.c +++ b/block.c @@ -739,14 +739,16 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res) return bs->drv->bdrv_check(bs, res); } +#define COMMIT_BUF_SECTORS 2048 + /* commit COW file into the raw image */ int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - int64_t i, total_sectors; - int n, j, ro, open_flags; + int64_t sector, total_sectors; + int n, ro, open_flags; int ret = 0, rw_ret = 0; - unsigned char sector[BDRV_SECTOR_SIZE]; + uint8_t *buf; char filename[1024]; BlockDriverState *bs_rw, *bs_ro; @@ -789,22 +791,20 @@ int bdrv_commit(BlockDriverState *bs) } total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; - for (i = 0; i < total_sectors;) { - if (drv->bdrv_is_allocated(bs, i, 65536, &n)) { - for(j = 0; j < n; j++) { - if (bdrv_read(bs, i, sector, 1) != 0) { - ret = -EIO; - goto ro_cleanup; - } + buf = qemu_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); - if (bdrv_write(bs->backing_hd, i, sector, 1) != 0) { - ret = -EIO; - goto ro_cleanup; - } - i++; - } - } else { - i += n; + for (sector = 0; sector < total_sectors; sector += n) { + if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { + + if (bdrv_read(bs, sector, buf, n) != 0) { + ret = -EIO; + goto ro_cleanup; + } + + if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { + ret = -EIO; + goto ro_cleanup; + } } } @@ -821,6 +821,7 @@ int bdrv_commit(BlockDriverState *bs) bdrv_flush(bs->backing_hd); ro_cleanup: + qemu_free(buf); if (ro) { /* re-open as RO */ From f0aa7a8b2d518c54430e4382309281b93e51981a Mon Sep 17 00:00:00 2001 From: Miguel Di Ciurcio Filho Date: Mon, 19 Jul 2010 15:25:01 -0300 Subject: [PATCH 2/6] loadvm: improve tests before bdrv_snapshot_goto() This patch improves the resilience of the load_vmstate() function, doing further and better ordered tests. In load_vmstate(), if there is any error on bdrv_snapshot_goto(), except if the error is on VM state device, load_vmstate() will return zero and the VM will be started with major corruption chances. The current process: - test if there is any writable device without snapshot support - if exists return -error - get the device that saves the VM state, possible return -error but unlikely because it was tested earlier - flush I/O - run bdrv_snapshot_goto() on devices - if fails, give an warning and goes to the next (not good!) - if fails on the VM state device, return zero (not good!) - check if the requested snapshot exists on the device that saves the VM state and the state is not zero - if fails return -error - open the file with the VM state - if fails return -error - load the VM state - if fails return -error - return zero New behavior: - get the device that saves the VM state - if fails return -error - check if the requested snapshot exists on the device that saves the VM state and the state is not zero - if fails return -error - test if there is any writable device without snapshot support - if exists return -error - test if the devices with snapshot support have the requested snapshot - if anyone fails, return -error - flush I/O - run snapshot_goto() on devices - if anyone fails, return -error - open the file with the VM state - if fails return -error - load the VM state - if fails return -error - return zero do_loadvm must not call vm_start if any error has occurred in load_vmstate. Signed-off-by: Miguel Di Ciurcio Filho Signed-off-by: Kevin Wolf --- monitor.c | 3 ++- savevm.c | 71 ++++++++++++++++++++++++++----------------------------- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/monitor.c b/monitor.c index 5366c36525..c313d5ac14 100644 --- a/monitor.c +++ b/monitor.c @@ -2274,8 +2274,9 @@ static void do_loadvm(Monitor *mon, const QDict *qdict) vm_stop(0); - if (load_vmstate(name) >= 0 && saved_vm_running) + if (load_vmstate(name) == 0 && saved_vm_running) { vm_start(); + } } int monitor_get_fd(Monitor *mon, const char *fdname) diff --git a/savevm.c b/savevm.c index 4c0e5d3b10..53d47a6234 100644 --- a/savevm.c +++ b/savevm.c @@ -1894,12 +1894,27 @@ void do_savevm(Monitor *mon, const QDict *qdict) int load_vmstate(const char *name) { - BlockDriverState *bs, *bs1; + BlockDriverState *bs, *bs_vm_state; QEMUSnapshotInfo sn; QEMUFile *f; int ret; - /* Verify if there is a device that doesn't support snapshots and is writable */ + bs_vm_state = bdrv_snapshots(); + if (!bs_vm_state) { + error_report("No block device supports snapshots"); + return -ENOTSUP; + } + + /* Don't even try to load empty VM states */ + ret = bdrv_snapshot_find(bs_vm_state, &sn, name); + if (ret < 0) { + return ret; + } else if (sn.vm_state_size == 0) { + return -EINVAL; + } + + /* Verify if there is any device that doesn't support snapshots and is + writable and check if the requested snapshot is available too. */ bs = NULL; while ((bs = bdrv_next(bs))) { @@ -1912,63 +1927,45 @@ int load_vmstate(const char *name) bdrv_get_device_name(bs)); return -ENOTSUP; } - } - bs = bdrv_snapshots(); - if (!bs) { - error_report("No block device supports snapshots"); - return -EINVAL; + ret = bdrv_snapshot_find(bs, &sn, name); + if (ret < 0) { + error_report("Device '%s' does not have the requested snapshot '%s'", + bdrv_get_device_name(bs), name); + return ret; + } } /* Flush all IO requests so they don't interfere with the new state. */ qemu_aio_flush(); - bs1 = NULL; - while ((bs1 = bdrv_next(bs1))) { - if (bdrv_can_snapshot(bs1)) { - ret = bdrv_snapshot_goto(bs1, name); + bs = NULL; + while ((bs = bdrv_next(bs))) { + if (bdrv_can_snapshot(bs)) { + ret = bdrv_snapshot_goto(bs, name); if (ret < 0) { - switch(ret) { - case -ENOTSUP: - error_report("%sSnapshots not supported on device '%s'", - bs != bs1 ? "Warning: " : "", - bdrv_get_device_name(bs1)); - break; - case -ENOENT: - error_report("%sCould not find snapshot '%s' on device '%s'", - bs != bs1 ? "Warning: " : "", - name, bdrv_get_device_name(bs1)); - break; - default: - error_report("%sError %d while activating snapshot on '%s'", - bs != bs1 ? "Warning: " : "", - ret, bdrv_get_device_name(bs1)); - break; - } - /* fatal on snapshot block device */ - if (bs == bs1) - return 0; + error_report("Error %d while activating snapshot '%s' on '%s'", + ret, name, bdrv_get_device_name(bs)); + return ret; } } } - /* Don't even try to load empty VM states */ - ret = bdrv_snapshot_find(bs, &sn, name); - if ((ret >= 0) && (sn.vm_state_size == 0)) - return -EINVAL; - /* restore the VM state */ - f = qemu_fopen_bdrv(bs, 0); + f = qemu_fopen_bdrv(bs_vm_state, 0); if (!f) { error_report("Could not open VM state file"); return -EINVAL; } + ret = qemu_loadvm_state(f); + qemu_fclose(f); if (ret < 0) { error_report("Error %d while loading VM state", ret); return ret; } + return 0; } From bd0858bb460c0c134e9a62c73e60e465037b1240 Mon Sep 17 00:00:00 2001 From: Yoshiaki Tamura Date: Mon, 26 Jul 2010 13:25:41 +0900 Subject: [PATCH 3/6] block migration: replace tabs by spaces. Signed-off-by: Yoshiaki Tamura Signed-off-by: Kevin Wolf --- block-migration.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block-migration.c b/block-migration.c index 8eda307d7f..0bfdb73c8b 100644 --- a/block-migration.c +++ b/block-migration.c @@ -346,7 +346,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - blk->time = qemu_get_clock_ns(rt_clock); + blk->time = qemu_get_clock_ns(rt_clock); blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); @@ -449,13 +449,13 @@ static int is_stage2_completed(void) if (block_mig_state.bulk_completed == 1) { remaining_dirty = get_remaining_dirty(); - if (remaining_dirty == 0) { - return 1; - } + if (remaining_dirty == 0) { + return 1; + } - bwidth = compute_read_bwidth(); + bwidth = compute_read_bwidth(); - if ((remaining_dirty / bwidth) <= + if ((remaining_dirty / bwidth) <= migrate_max_downtime()) { /* finish stage2 because we think that we can finish remaing work below max_downtime */ From 336c1c12551ff0a6e1a2af226d6cbdbadd2e02b5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 28 Jul 2010 11:26:29 +0200 Subject: [PATCH 4/6] block: Fix bdrv_has_zero_init Assuming that any image on a block device is not properly zero-initialized is actually wrong: Only raw images have this problem. Any other image format shouldn't care about it, they initialize everything properly themselves. Signed-off-by: Kevin Wolf --- block.c | 6 ++---- block/raw-posix.c | 13 +++++++++---- block/raw-win32.c | 6 ++++++ block/raw.c | 6 ++++++ block_int.h | 7 +++++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/block.c b/block.c index c4d6814726..a12be0bac1 100644 --- a/block.c +++ b/block.c @@ -1477,10 +1477,8 @@ int bdrv_has_zero_init(BlockDriverState *bs) { assert(bs->drv); - if (bs->drv->no_zero_init) { - return 0; - } else if (bs->file) { - return bdrv_has_zero_init(bs->file); + if (bs->drv->bdrv_has_zero_init) { + return bs->drv->bdrv_has_zero_init(bs); } return 1; diff --git a/block/raw-posix.c b/block/raw-posix.c index a11170ed16..72fb8cebd4 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -993,6 +993,11 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) return ret; } +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", @@ -1002,7 +1007,7 @@ static BlockDriver bdrv_host_device = { .bdrv_close = raw_close, .bdrv_create = hdev_create, .create_options = raw_create_options, - .no_zero_init = 1, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, .bdrv_aio_readv = raw_aio_readv, @@ -1117,7 +1122,7 @@ static BlockDriver bdrv_host_floppy = { .bdrv_close = raw_close, .bdrv_create = hdev_create, .create_options = raw_create_options, - .no_zero_init = 1, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, .bdrv_aio_readv = raw_aio_readv, @@ -1217,7 +1222,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_close = raw_close, .bdrv_create = hdev_create, .create_options = raw_create_options, - .no_zero_init = 1, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, .bdrv_aio_readv = raw_aio_readv, @@ -1340,7 +1345,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_close = raw_close, .bdrv_create = hdev_create, .create_options = raw_create_options, - .no_zero_init = 1, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, .bdrv_aio_readv = raw_aio_readv, diff --git a/block/raw-win32.c b/block/raw-win32.c index 745bbde673..503ed3959a 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -394,6 +394,11 @@ static int raw_set_locked(BlockDriverState *bs, int locked) } #endif +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", @@ -402,6 +407,7 @@ static BlockDriver bdrv_host_device = { .bdrv_file_open = hdev_open, .bdrv_close = raw_close, .bdrv_flush = raw_flush, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_read = raw_read, .bdrv_write = raw_write, diff --git a/block/raw.c b/block/raw.c index 1414e777b3..61e674856d 100644 --- a/block/raw.c +++ b/block/raw.c @@ -237,6 +237,11 @@ static QEMUOptionParameter raw_create_options[] = { { NULL } }; +static int raw_has_zero_init(BlockDriverState *bs) +{ + return bdrv_has_zero_init(bs->file); +} + static BlockDriver bdrv_raw = { .format_name = "raw", @@ -264,6 +269,7 @@ static BlockDriver bdrv_raw = { .bdrv_create = raw_create, .create_options = raw_create_options, + .bdrv_has_zero_init = raw_has_zero_init, }; static void bdrv_raw_init(void) diff --git a/block_int.h b/block_int.h index f075a8cba5..7d5e7515d2 100644 --- a/block_int.h +++ b/block_int.h @@ -127,8 +127,11 @@ struct BlockDriver { void (*bdrv_debug_event)(BlockDriverState *bs, BlkDebugEvent event); - /* Set if newly created images are not guaranteed to contain only zeros */ - int no_zero_init; + /* + * Returns 1 if newly created images are guaranteed to contain only + * zeros, 0 otherwise. + */ + int (*bdrv_has_zero_init)(BlockDriverState *bs); QLIST_ENTRY(BlockDriver) list; }; From 4be9762adb0947a353e6efef2fed354f69218bfb Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 27 Jul 2010 14:02:01 +0200 Subject: [PATCH 5/6] block: Change bdrv_eject() not to drop the image bdrv_eject() gets called when a device model opens or closes the tray. If the block driver implements method bdrv_eject(), that method gets called. Drivers host_cdrom implements it, and it opens and closes the physical tray, and nothing else. When a device model opens, then closes the tray, media changes only if the user actively changes the physical media while the tray is open. This is matches how physical hardware behaves. If the block driver doesn't implement method bdrv_eject(), we do something quite different: opening the tray severs the connection to the image by calling bdrv_close(), and closing the tray does nothing. When the device model opens, then closes the tray, media is gone, unless the user actively inserts another one while the tray is open, with a suitable change command in the monitor. This isn't how physical hardware behaves. Rather inconvenient when programs "helpfully" eject media to give you a chance to change it. The way bdrv_eject() behaves here turns that chance into a must, which is not what these programs or their users expect. Change the default action not to call bdrv_close(). Instead, note the tray status in new BlockDriverState member tray_open. Use it in bdrv_is_inserted(). Arguably, the device models should keep track of tray status themselves. But this is less invasive. Signed-off-by: Markus Armbruster Signed-off-by: Kevin Wolf --- block.c | 7 ++++--- block_int.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index a12be0bac1..da70f2968b 100644 --- a/block.c +++ b/block.c @@ -2517,7 +2517,7 @@ int bdrv_is_inserted(BlockDriverState *bs) if (!drv) return 0; if (!drv->bdrv_is_inserted) - return 1; + return !bs->tray_open; ret = drv->bdrv_is_inserted(bs); return ret; } @@ -2559,10 +2559,11 @@ int bdrv_eject(BlockDriverState *bs, int eject_flag) ret = drv->bdrv_eject(bs, eject_flag); } if (ret == -ENOTSUP) { - if (eject_flag) - bdrv_close(bs); ret = 0; } + if (ret >= 0) { + bs->tray_open = eject_flag; + } return ret; } diff --git a/block_int.h b/block_int.h index 7d5e7515d2..b863451774 100644 --- a/block_int.h +++ b/block_int.h @@ -144,6 +144,7 @@ struct BlockDriverState { int open_flags; /* flags used to open the file, re-used for re-open */ int removable; /* if true, the media can be removed */ int locked; /* if true, the media cannot temporarily be ejected */ + int tray_open; /* if true, the virtual tray is open */ int encrypted; /* if true, the media is encrypted */ int valid_key; /* if true, a valid encryption key has been set */ int sg; /* if true, the device is a /dev/sg* */ From 953844d102f5b682f0835f021f2ed2ad9fb7734c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 27 Jul 2010 21:04:36 +0200 Subject: [PATCH 6/6] ide: Avoid canceling IDE DMA The reason for not actually canceling the I/O is because with virtualization and lots of VM running, a guest fs may mistake a overload of the host, as an IDE timeout. So rather than canceling the I/O, it's safer to wait I/O completion and simulate that the I/O has completed just before the io cancellation was requested by the guest. This way if ntfs or an app writes data without checking for -EIO retval, and it thinks the write has succeeded, it's less likely to run into troubles. Similar issues for reads. Furthermore because the DMA operation is splitted into many synchronous aio_read/write if there's more than one entry in the SG table, without this patch the DMA would be cancelled in the middle, something we've no idea if it happens on real hardware too or not. Overall this seems a great risk for zero gain. This approach is sure safer than previous code given we can't pretend all guest fs code out there to check for errors and reply the DMA if it was completed partially, given a timeout would never materialize on a real harddisk unless there are defective blocks (and defective blocks are practically only an issue for reads never for writes in any recent hardware as writing to blocks is the way to fix them) or the harddisk breaks as a whole. Signed-off-by: Izik Eidus Signed-off-by: Andrea Arcangeli Signed-off-by: Kevin Wolf --- hw/ide/pci.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/hw/ide/pci.c b/hw/ide/pci.c index 4331d77232..ec90f266e9 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -40,8 +40,27 @@ void bmdma_cmd_writeb(void *opaque, uint32_t addr, uint32_t val) printf("%s: 0x%08x\n", __func__, val); #endif if (!(val & BM_CMD_START)) { - /* XXX: do it better */ - ide_dma_cancel(bm); + /* + * We can't cancel Scatter Gather DMA in the middle of the + * operation or a partial (not full) DMA transfer would reach + * the storage so we wait for completion instead (we beahve + * like if the DMA was completed by the time the guest trying + * to cancel dma with bmdma_cmd_writeb with BM_CMD_START not + * set). + * + * In the future we'll be able to safely cancel the I/O if the + * whole DMA operation will be submitted to disk with a single + * aio operation with preadv/pwritev. + */ + if (bm->aiocb) { + qemu_aio_flush(); +#ifdef DEBUG_IDE + if (bm->aiocb) + printf("ide_dma_cancel: aiocb still pending"); + if (bm->status & BM_STATUS_DMAING) + printf("ide_dma_cancel: BM_STATUS_DMAING still pending"); +#endif + } bm->cmd = val & 0x09; } else { if (!(bm->status & BM_STATUS_DMAING)) {