From 1a86938f04b1abfd28e053b8c6f4b8ed9e4ffe08 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:01 +0100 Subject: [PATCH 01/28] block: Add options QDict to .bdrv_open() Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- block.c | 4 ++-- block/bochs.c | 2 +- block/cloop.c | 2 +- block/cow.c | 2 +- block/dmg.c | 2 +- block/parallels.c | 2 +- block/qcow.c | 2 +- block/qcow2.c | 4 ++-- block/qed.c | 4 ++-- block/raw.c | 2 +- block/vdi.c | 2 +- block/vmdk.c | 2 +- block/vpc.c | 2 +- include/block/block_int.h | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/block.c b/block.c index 124a9ebf65..01cee877c5 100644 --- a/block.c +++ b/block.c @@ -710,7 +710,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } else { assert(file != NULL); bs->file = file; - ret = drv->bdrv_open(bs, open_flags); + ret = drv->bdrv_open(bs, NULL, open_flags); } if (ret < 0) { @@ -3190,7 +3190,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs, if (bs->file) { drv->bdrv_close(bs); ret = bdrv_snapshot_goto(bs->file, snapshot_id); - open_ret = drv->bdrv_open(bs, bs->open_flags); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); if (open_ret < 0) { bdrv_delete(bs->file); bs->drv = NULL; diff --git a/block/bochs.c b/block/bochs.c index a6eb33da42..d7078c0775 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -108,7 +108,7 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, int flags) +static int bochs_open(BlockDriverState *bs, QDict *options, int flags) { BDRVBochsState *s = bs->opaque; int i; diff --git a/block/cloop.c b/block/cloop.c index 8fe13e92a1..6ea7cf4046 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -53,7 +53,7 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, int flags) +static int cloop_open(BlockDriverState *bs, QDict *options, int flags) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size, max_compressed_block_size = 1, i; diff --git a/block/cow.c b/block/cow.c index 4baf9042fe..d73e08cf92 100644 --- a/block/cow.c +++ b/block/cow.c @@ -58,7 +58,7 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, int flags) +static int cow_open(BlockDriverState *bs, QDict *options, int flags) { BDRVCowState *s = bs->opaque; struct cow_header_v2 cow_header; diff --git a/block/dmg.c b/block/dmg.c index 6d85801a84..c1066df13a 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -85,7 +85,7 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } -static int dmg_open(BlockDriverState *bs, int flags) +static int dmg_open(BlockDriverState *bs, QDict *options, int flags) { BDRVDMGState *s = bs->opaque; uint64_t info_begin,info_end,last_in_offset,last_out_offset; diff --git a/block/parallels.c b/block/parallels.c index 8688f6ca4c..18b3ac0b28 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -68,7 +68,7 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, int flags) +static int parallels_open(BlockDriverState *bs, QDict *options, int flags) { BDRVParallelsState *s = bs->opaque; int i; diff --git a/block/qcow.c b/block/qcow.c index a7135eea47..f6750a5bd3 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -92,7 +92,7 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, int flags) +static int qcow_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQcowState *s = bs->opaque; int len, i, shift, ret; diff --git a/block/qcow2.c b/block/qcow2.c index 7610e569e3..4c501a5ffc 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -285,7 +285,7 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, return ret; } -static int qcow2_open(BlockDriverState *bs, int flags) +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQcowState *s = bs->opaque; int len, i, ret = 0; @@ -912,7 +912,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) qcow2_close(bs); memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, flags); + qcow2_open(bs, NULL, flags); if (crypt_method) { s->crypt_method = crypt_method; diff --git a/block/qed.c b/block/qed.c index b8515e58b3..46e12b358b 100644 --- a/block/qed.c +++ b/block/qed.c @@ -373,7 +373,7 @@ static void bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } -static int bdrv_qed_open(BlockDriverState *bs, int flags) +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQEDState *s = bs->opaque; QEDHeader le_header; @@ -1526,7 +1526,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs) bdrv_qed_close(bs); memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, bs->open_flags); + bdrv_qed_open(bs, NULL, bs->open_flags); } static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, diff --git a/block/raw.c b/block/raw.c index 75812db3c2..ce10422006 100644 --- a/block/raw.c +++ b/block/raw.c @@ -3,7 +3,7 @@ #include "block/block_int.h" #include "qemu/module.h" -static int raw_open(BlockDriverState *bs, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags) { bs->sg = bs->file->sg; return 0; diff --git a/block/vdi.c b/block/vdi.c index 87c691b504..2662d89af6 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -364,7 +364,7 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) return result; } -static int vdi_open(BlockDriverState *bs, int flags) +static int vdi_open(BlockDriverState *bs, QDict *options, int flags) { BDRVVdiState *s = bs->opaque; VdiHeader header; diff --git a/block/vmdk.c b/block/vmdk.c index aef1abcb4f..4a13fa69c0 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -723,7 +723,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, return vmdk_parse_extents(buf, bs, bs->file->filename); } -static int vmdk_open(BlockDriverState *bs, int flags) +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) { int ret; BDRVVmdkState *s = bs->opaque; diff --git a/block/vpc.c b/block/vpc.c index 82229ef5a0..3cad52e54c 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -155,7 +155,7 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, int flags) +static int vpc_open(BlockDriverState *bs, QDict *options, int flags) { BDRVVPCState *s = bs->opaque; int i; diff --git a/include/block/block_int.h b/include/block/block_int.h index eaad53e426..4b659fa960 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -82,7 +82,7 @@ struct BlockDriver { void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); - int (*bdrv_open)(BlockDriverState *bs, int flags); + int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags); int (*bdrv_file_open)(BlockDriverState *bs, const char *filename, int flags); int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); From de9c0cec6c823071b903ebeebf1ee70b394ff46f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:02 +0100 Subject: [PATCH 02/28] block: Add options QDict to bdrv_open() prototype It doesn't do anything yet except storing the options QDict in the BlockDriverState. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- block.c | 47 +++++++++++++++++++++++++++++---------- block/blkverify.c | 2 +- block/qcow2.c | 2 +- block/vmdk.c | 2 +- block/vvfat.c | 2 +- blockdev.c | 10 +++++---- hw/xen_disk.c | 2 +- include/block/block.h | 4 ++-- include/block/block_int.h | 1 + qemu-img.c | 6 ++--- qemu-io.c | 2 +- qemu-nbd.c | 2 +- 12 files changed, 54 insertions(+), 28 deletions(-) diff --git a/block.c b/block.c index 01cee877c5..761ea860a9 100644 --- a/block.c +++ b/block.c @@ -788,7 +788,8 @@ int bdrv_open_backing_file(BlockDriverState *bs) /* backing files always opened read-only */ back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT); - ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv); + ret = bdrv_open(bs->backing_hd, backing_filename, NULL, + back_flags, back_drv); if (ret < 0) { bdrv_delete(bs->backing_hd); bs->backing_hd = NULL; @@ -800,15 +801,28 @@ int bdrv_open_backing_file(BlockDriverState *bs) /* * Opens a disk image (raw, qcow2, vmdk, ...) + * + * options is a QDict of options to pass to the block drivers, or NULL for an + * empty set of options. The reference to the QDict belongs to the block layer + * after the call (even on failure), so if the caller intends to reuse the + * dictionary, it needs to use QINCREF() before calling bdrv_open. */ -int bdrv_open(BlockDriverState *bs, const char *filename, int flags, - BlockDriver *drv) +int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, + int flags, BlockDriver *drv) { int ret; /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ char tmp_filename[PATH_MAX + 1]; BlockDriverState *file = NULL; + /* NULL means an empty set of options */ + if (options == NULL) { + options = qdict_new(); + } + + bs->options = options; + + /* For snapshot=on, create a temporary qcow2 overlay */ if (flags & BDRV_O_SNAPSHOT) { BlockDriverState *bs1; int64_t total_size; @@ -822,10 +836,10 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, /* if there is a backing file, use it */ bs1 = bdrv_new(""); - ret = bdrv_open(bs1, filename, 0, drv); + ret = bdrv_open(bs1, filename, NULL, 0, drv); if (ret < 0) { bdrv_delete(bs1); - return ret; + goto fail; } total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK; @@ -836,15 +850,17 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename)); if (ret < 0) { - return ret; + goto fail; } /* Real path is meaningless for protocols */ - if (is_protocol) + if (is_protocol) { snprintf(backing_filename, sizeof(backing_filename), "%s", filename); - else if (!realpath(filename, backing_filename)) - return -errno; + } else if (!realpath(filename, backing_filename)) { + ret = -errno; + goto fail; + } bdrv_qcow2 = bdrv_find_format("qcow2"); options = parse_option_parameters("", bdrv_qcow2->create_options, NULL); @@ -859,7 +875,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, ret = bdrv_create(bdrv_qcow2, tmp_filename, options); free_option_parameters(options); if (ret < 0) { - return ret; + goto fail; } filename = tmp_filename; @@ -874,7 +890,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, ret = bdrv_file_open(&file, filename, bdrv_open_flags(bs, flags)); if (ret < 0) { - return ret; + goto fail; } /* Find the right image format driver */ @@ -924,6 +940,10 @@ unlink_and_fail: if (bs->is_temporary) { unlink(filename); } +fail: + QDECREF(bs->options); + bs->options = NULL; + return ret; } @@ -1193,6 +1213,8 @@ void bdrv_close(BlockDriverState *bs) bs->valid_key = 0; bs->sg = 0; bs->growable = 0; + QDECREF(bs->options); + bs->options = NULL; if (bs->file != NULL) { bdrv_delete(bs->file); @@ -4594,7 +4616,8 @@ void bdrv_img_create(const char *filename, const char *fmt, bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv); + ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags, + backing_drv); if (ret < 0) { error_setg_errno(errp, -ret, "Could not open '%s'", backing_file->value.s); diff --git a/block/blkverify.c b/block/blkverify.c index a7dd45909b..2086d97234 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -98,7 +98,7 @@ static int blkverify_open(BlockDriverState *bs, const char *filename, int flags) /* Open the test file */ s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, flags, NULL); + ret = bdrv_open(s->test_file, filename, NULL, flags, NULL); if (ret < 0) { bdrv_delete(s->test_file); s->test_file = NULL; diff --git a/block/qcow2.c b/block/qcow2.c index 4c501a5ffc..f5e4269363 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1265,7 +1265,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ BlockDriver* drv = bdrv_find_format("qcow2"); assert(drv != NULL); - ret = bdrv_open(bs, filename, + ret = bdrv_open(bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); if (ret < 0) { goto out; diff --git a/block/vmdk.c b/block/vmdk.c index 4a13fa69c0..e92104a830 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1527,7 +1527,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) if (backing_file) { char parent_filename[PATH_MAX]; BlockDriverState *bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file, 0, NULL); + ret = bdrv_open(bs, backing_file, NULL, 0, NULL); if (ret != 0) { bdrv_delete(bs); return ret; diff --git a/block/vvfat.c b/block/vvfat.c index 06e6654824..b8eb38ab36 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2830,7 +2830,7 @@ static int enable_write_target(BDRVVVFATState *s) return -1; } - ret = bdrv_open(s->qcow, s->qcow_filename, + ret = bdrv_open(s->qcow, s->qcow_filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); if (ret < 0) { return ret; diff --git a/blockdev.c b/blockdev.c index 0e67d06330..d67917486f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -635,7 +635,7 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) error_report("warning: disabling copy_on_read on readonly drive"); } - ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv); + ret = bdrv_open(dinfo->bdrv, file, NULL, bdrv_flags, drv); if (ret < 0) { if (ret == -EMEDIUMTYPE) { error_report("could not open disk image %s: not in %s format", @@ -820,7 +820,9 @@ void qmp_transaction(BlockdevActionList *dev_list, Error **errp) /* We will manually add the backing_hd field to the bs later */ states->new_bs = bdrv_new(""); - ret = bdrv_open(states->new_bs, new_image_file, + /* TODO Inherit bs->options or only take explicit options with an + * extended QMP command? */ + ret = bdrv_open(states->new_bs, new_image_file, NULL, flags | BDRV_O_NO_BACKING, drv); if (ret != 0) { error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file); @@ -921,7 +923,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename, int bdrv_flags, BlockDriver *drv, const char *password, Error **errp) { - if (bdrv_open(bs, filename, bdrv_flags, drv) < 0) { + if (bdrv_open(bs, filename, NULL, bdrv_flags, drv) < 0) { error_set(errp, QERR_OPEN_FILE_FAILED, filename); return; } @@ -1330,7 +1332,7 @@ void qmp_drive_mirror(const char *device, const char *target, * file. */ target_bs = bdrv_new(""); - ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING, drv); + ret = bdrv_open(target_bs, target, NULL, flags | BDRV_O_NO_BACKING, drv); if (ret < 0) { bdrv_delete(target_bs); diff --git a/hw/xen_disk.c b/hw/xen_disk.c index cc09a2f1fa..83329e2e69 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -763,7 +763,7 @@ static int blk_init(struct XenDevice *xendev) xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n"); blkdev->bs = bdrv_new(blkdev->dev); if (blkdev->bs) { - if (bdrv_open(blkdev->bs, blkdev->filename, qflags, + if (bdrv_open(blkdev->bs, blkdev->filename, NULL, qflags, bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) { bdrv_delete(blkdev->bs); blkdev->bs = NULL; diff --git a/include/block/block.h b/include/block/block.h index 0f750d7da3..d4f34d6462 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -137,8 +137,8 @@ int bdrv_parse_cache_flags(const char *mode, int *flags); int bdrv_parse_discard_flags(const char *mode, int *flags); int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags); int bdrv_open_backing_file(BlockDriverState *bs); -int bdrv_open(BlockDriverState *bs, const char *filename, int flags, - BlockDriver *drv); +int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, + int flags, BlockDriver *drv); BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, int flags); int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); diff --git a/include/block/block_int.h b/include/block/block_int.h index 4b659fa960..baf80e3c42 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -286,6 +286,7 @@ struct BlockDriverState { /* long-running background operation */ BlockJob *job; + QDict *options; }; int get_tmp_filename(char *filename, int size); diff --git a/qemu-img.c b/qemu-img.c index 471de7d646..31627b0da8 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -276,7 +276,7 @@ static BlockDriverState *bdrv_new_open(const char *filename, drv = NULL; } - ret = bdrv_open(bs, filename, flags, drv); + ret = bdrv_open(bs, filename, NULL, flags, drv); if (ret < 0) { error_report("Could not open '%s': %s", filename, strerror(-ret)); goto fail; @@ -2156,7 +2156,7 @@ static int img_rebase(int argc, char **argv) bs_old_backing = bdrv_new("old_backing"); bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name)); - ret = bdrv_open(bs_old_backing, backing_name, BDRV_O_FLAGS, + ret = bdrv_open(bs_old_backing, backing_name, NULL, BDRV_O_FLAGS, old_backing_drv); if (ret) { error_report("Could not open old backing file '%s'", backing_name); @@ -2164,7 +2164,7 @@ static int img_rebase(int argc, char **argv) } if (out_baseimg[0]) { bs_new_backing = bdrv_new("new_backing"); - ret = bdrv_open(bs_new_backing, out_baseimg, BDRV_O_FLAGS, + ret = bdrv_open(bs_new_backing, out_baseimg, NULL, BDRV_O_FLAGS, new_backing_drv); if (ret) { error_report("Could not open new backing file '%s'", diff --git a/qemu-io.c b/qemu-io.c index 7b3de42773..79be516953 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -1773,7 +1773,7 @@ static int openfile(char *name, int flags, int growable) } else { bs = bdrv_new("hda"); - if (bdrv_open(bs, name, flags, NULL) < 0) { + if (bdrv_open(bs, name, NULL, flags, NULL) < 0) { fprintf(stderr, "%s: can't open device %s\n", progname, name); bdrv_delete(bs); bs = NULL; diff --git a/qemu-nbd.c b/qemu-nbd.c index e7268d0a9f..ca722ed425 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -557,7 +557,7 @@ int main(int argc, char **argv) bs = bdrv_new("hda"); srcpath = argv[optind]; - if ((ret = bdrv_open(bs, srcpath, flags, NULL)) < 0) { + if ((ret = bdrv_open(bs, srcpath, NULL, flags, NULL)) < 0) { errno = -ret; err(EXIT_FAILURE, "Failed to bdrv_open '%s'", argv[optind]); } From b382bc9a1504c7f7c112881695d08293b906b88f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:03 +0100 Subject: [PATCH 03/28] Add qdict_clone_shallow() Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- include/qapi/qmp/qdict.h | 2 ++ qobject/qdict.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/qapi/qmp/qdict.h b/include/qapi/qmp/qdict.h index 6d9a4be3a5..685b2e3fcb 100644 --- a/include/qapi/qmp/qdict.h +++ b/include/qapi/qmp/qdict.h @@ -64,4 +64,6 @@ int64_t qdict_get_try_int(const QDict *qdict, const char *key, int qdict_get_try_bool(const QDict *qdict, const char *key, int def_value); const char *qdict_get_try_str(const QDict *qdict, const char *key); +QDict *qdict_clone_shallow(const QDict *src); + #endif /* QDICT_H */ diff --git a/qobject/qdict.c b/qobject/qdict.c index 7543ccc10f..ed381f9a50 100644 --- a/qobject/qdict.c +++ b/qobject/qdict.c @@ -400,6 +400,28 @@ const QDictEntry *qdict_next(const QDict *qdict, const QDictEntry *entry) return ret; } +/** + * qdict_clone_shallow(): Clones a given QDict. Its entries are not copied, but + * another reference is added. + */ +QDict *qdict_clone_shallow(const QDict *src) +{ + QDict *dest; + QDictEntry *entry; + int i; + + dest = qdict_new(); + + for (i = 0; i < QDICT_BUCKET_MAX; i++) { + QLIST_FOREACH(entry, &src->table[i], next) { + qobject_incref(entry->value); + qdict_put_obj(dest, entry->key, entry->value); + } + } + + return dest; +} + /** * qentry_destroy(): Free all the memory allocated by a QDictEntry */ From b6ad491a4921557efc8815fe0ca5f11c6d8d587e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:04 +0100 Subject: [PATCH 04/28] block: Add options QDict to bdrv_open_common() The options are passed down to the block drivers, which are supposed to remove all options they have processed. Anything that is left over in the end is an unknown option and results in an error. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- block.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index 761ea860a9..0ab164d992 100644 --- a/block.c +++ b/block.c @@ -665,15 +665,18 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags) /* * Common part for opening disk images and files + * + * Removes all processed options from *options. */ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, - const char *filename, + const char *filename, QDict *options, int flags, BlockDriver *drv) { int ret, open_flags; assert(drv != NULL); assert(bs->file == NULL); + assert(options == NULL || bs->options != options); trace_bdrv_open_common(bs, filename, flags, drv->format_name); @@ -710,7 +713,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } else { assert(file != NULL); bs->file = file; - ret = drv->bdrv_open(bs, NULL, open_flags); + ret = drv->bdrv_open(bs, options, open_flags); } if (ret < 0) { @@ -752,7 +755,7 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags) } bs = bdrv_new(""); - ret = bdrv_open_common(bs, NULL, filename, flags, drv); + ret = bdrv_open_common(bs, NULL, filename, NULL, flags, drv); if (ret < 0) { bdrv_delete(bs); return ret; @@ -821,6 +824,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, } bs->options = options; + options = qdict_clone_shallow(options); /* For snapshot=on, create a temporary qcow2 overlay */ if (flags & BDRV_O_SNAPSHOT) { @@ -903,7 +907,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, } /* Open the image */ - ret = bdrv_open_common(bs, file, filename, flags, drv); + ret = bdrv_open_common(bs, file, filename, options, flags, drv); if (ret < 0) { goto unlink_and_fail; } @@ -917,11 +921,22 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, if ((flags & BDRV_O_NO_BACKING) == 0) { ret = bdrv_open_backing_file(bs); if (ret < 0) { - bdrv_close(bs); - return ret; + goto close_and_fail; } } + /* Check if any unknown options were used */ + if (qdict_size(options) != 0) { + const QDictEntry *entry = qdict_first(options); + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by " + "device '%s' doesn't support the option '%s'", + drv->format_name, bs->device_name, entry->key); + + ret = -EINVAL; + goto close_and_fail; + } + QDECREF(options); + if (!bdrv_key_required(bs)) { bdrv_dev_change_media_cb(bs, true); } @@ -942,8 +957,13 @@ unlink_and_fail: } fail: QDECREF(bs->options); + QDECREF(options); bs->options = NULL; + return ret; +close_and_fail: + bdrv_close(bs); + QDECREF(options); return ret; } From 376609cc6c03c2ffc8c323d804d27f95346cac08 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:05 +0100 Subject: [PATCH 05/28] qemu-option: Add qemu_opts_absorb_qdict() This adds a function that adds all entries of a QDict to a QemuOpts if the keys are known, and leaves only the rest in the QDict. This way a single QDict of -drive options can be processed in multiple places (generic block layer, block driver, backing file block driver, etc.), where each part picks the options it knows. If at the end of the process the QDict isn't empty, the user specified an invalid option. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- include/qemu/option.h | 1 + util/qemu-option.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/qemu/option.h b/include/qemu/option.h index ba197cddcf..bdb6d218b4 100644 --- a/include/qemu/option.h +++ b/include/qemu/option.h @@ -149,6 +149,7 @@ void qemu_opts_set_defaults(QemuOptsList *list, const char *params, QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict, Error **errp); QDict *qemu_opts_to_qdict(QemuOpts *opts, QDict *qdict); +void qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp); typedef int (*qemu_opts_loopfunc)(QemuOpts *opts, void *opaque); int qemu_opts_print(QemuOpts *opts, void *dummy); diff --git a/util/qemu-option.c b/util/qemu-option.c index 5a1d03c331..8b74bf1f7a 100644 --- a/util/qemu-option.c +++ b/util/qemu-option.c @@ -1066,6 +1066,40 @@ QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict, return opts; } +/* + * Adds all QDict entries to the QemuOpts that can be added and removes them + * from the QDict. When this function returns, the QDict contains only those + * entries that couldn't be added to the QemuOpts. + */ +void qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp) +{ + const QDictEntry *entry, *next; + + entry = qdict_first(qdict); + + while (entry != NULL) { + Error *local_err = NULL; + OptsFromQDictState state = { + .errp = &local_err, + .opts = opts, + }; + + next = qdict_next(qdict, entry); + + if (find_desc_by_name(opts->list->desc, entry->key)) { + qemu_opts_from_qdict_1(entry->key, entry->value, &state); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return; + } else { + qdict_del(qdict, entry->key); + } + } + + entry = next; + } +} + /* * Convert from QemuOpts to QDict. * The QDict values are of type QString. From bb44619b06c0bef20b658ff532cf850c16362ae7 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:06 +0100 Subject: [PATCH 06/28] blockdev: Keep a copy of DriveInfo.serial Pointing to a QemuOpts element is surprising and can lead to subtle use-after-free errors when the QemuOpts is freed after all options are parsed. Signed-off-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- blockdev.c | 5 ++++- include/sysemu/blockdev.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index d67917486f..acf1c32481 100644 --- a/blockdev.c +++ b/blockdev.c @@ -191,6 +191,7 @@ static void drive_uninit(DriveInfo *dinfo) bdrv_delete(dinfo->bdrv); g_free(dinfo->id); QTAILQ_REMOVE(&drives, dinfo, next); + g_free(dinfo->serial); g_free(dinfo); } @@ -566,7 +567,9 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) dinfo->trans = translation; dinfo->opts = opts; dinfo->refcount = 1; - dinfo->serial = serial; + if (serial != NULL) { + dinfo->serial = g_strdup(serial); + } QTAILQ_INSERT_TAIL(&drives, dinfo, next); bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h index 1fe533299e..804ec8839b 100644 --- a/include/sysemu/blockdev.h +++ b/include/sysemu/blockdev.h @@ -40,7 +40,7 @@ struct DriveInfo { int media_cd; int cyls, heads, secs, trans; QemuOpts *opts; - const char *serial; + char *serial; QTAILQ_ENTRY(DriveInfo) next; int refcount; }; From 0006383e1549b6f9b264575d124c553efca19d62 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:07 +0100 Subject: [PATCH 07/28] block: Support driver specific options in drive_init() Any non-default -drive options are now passed down to the block drivers. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- blockdev.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/blockdev.c b/blockdev.c index acf1c32481..7ae619860b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -22,6 +22,7 @@ #include "sysemu/arch_init.h" static QTAILQ_HEAD(drivelist, DriveInfo) drives = QTAILQ_HEAD_INITIALIZER(drives); +extern QemuOptsList qemu_common_drive_opts; static const char *const if_name[IF_COUNT] = { [IF_NONE] = "none", @@ -288,7 +289,7 @@ static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp) return true; } -DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) +DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) { const char *buf; const char *file = NULL; @@ -311,10 +312,36 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) bool copy_on_read; int ret; Error *error = NULL; + QemuOpts *opts; + QDict *bs_opts; + const char *id; translation = BIOS_ATA_TRANSLATION_AUTO; media = MEDIA_DISK; + /* Check common options by copying from all_opts to opts, all other options + * are stored in bs_opts. */ + id = qemu_opts_id(all_opts); + opts = qemu_opts_create(&qemu_common_drive_opts, id, 1, &error); + if (error_is_set(&error)) { + qerror_report_err(error); + error_free(error); + return NULL; + } + + bs_opts = qdict_new(); + qemu_opts_to_qdict(all_opts, bs_opts); + qemu_opts_absorb_qdict(opts, bs_opts, &error); + if (error_is_set(&error)) { + qerror_report_err(error); + error_free(error); + return NULL; + } + + if (id) { + qdict_del(bs_opts, "id"); + } + /* extract parameters */ bus_id = qemu_opt_get_number(opts, "bus", 0); unit_id = qemu_opt_get_number(opts, "unit", -1); @@ -565,7 +592,7 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) dinfo->heads = heads; dinfo->secs = secs; dinfo->trans = translation; - dinfo->opts = opts; + dinfo->opts = all_opts; dinfo->refcount = 1; if (serial != NULL) { dinfo->serial = g_strdup(serial); @@ -590,17 +617,20 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) case IF_MTD: break; case IF_VIRTIO: + { /* add virtio block device */ - opts = qemu_opts_create_nofail(qemu_find_opts("device")); + QemuOpts *devopts; + devopts = qemu_opts_create_nofail(qemu_find_opts("device")); if (arch_type == QEMU_ARCH_S390X) { - qemu_opt_set(opts, "driver", "virtio-blk-s390"); + qemu_opt_set(devopts, "driver", "virtio-blk-s390"); } else { - qemu_opt_set(opts, "driver", "virtio-blk-pci"); + qemu_opt_set(devopts, "driver", "virtio-blk-pci"); } - qemu_opt_set(opts, "drive", dinfo->id); + qemu_opt_set(devopts, "drive", dinfo->id); if (devaddr) - qemu_opt_set(opts, "addr", devaddr); + qemu_opt_set(devopts, "addr", devaddr); break; + } default: abort(); } @@ -638,7 +668,9 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) error_report("warning: disabling copy_on_read on readonly drive"); } - ret = bdrv_open(dinfo->bdrv, file, NULL, bdrv_flags, drv); + ret = bdrv_open(dinfo->bdrv, file, bs_opts, bdrv_flags, drv); + bs_opts = NULL; + if (ret < 0) { if (ret == -EMEDIUMTYPE) { error_report("could not open disk image %s: not in %s format", @@ -652,9 +684,14 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) if (bdrv_key_required(dinfo->bdrv)) autostart = 0; + + qemu_opts_del(opts); + return dinfo; err: + qemu_opts_del(opts); + QDECREF(bs_opts); bdrv_delete(dinfo->bdrv); g_free(dinfo->id); QTAILQ_REMOVE(&drives, dinfo, next); @@ -1464,9 +1501,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp) return dummy.next; } -QemuOptsList qemu_drive_opts = { +QemuOptsList qemu_common_drive_opts = { .name = "drive", - .head = QTAILQ_HEAD_INITIALIZER(qemu_drive_opts.head), + .head = QTAILQ_HEAD_INITIALIZER(qemu_common_drive_opts.head), .desc = { { .name = "bus", @@ -1585,3 +1622,15 @@ QemuOptsList qemu_drive_opts = { { /* end of list */ } }, }; + +QemuOptsList qemu_drive_opts = { + .name = "drive", + .head = QTAILQ_HEAD_INITIALIZER(qemu_drive_opts.head), + .desc = { + /* + * no elements => accept any params + * validation will happen later + */ + { /* end of list */ } + }, +}; From 74c4510a3cf7c369ee524c6f8fa8a87cf08ba608 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 15 Mar 2013 10:35:08 +0100 Subject: [PATCH 08/28] qcow2: Allow lazy refcounts to be enabled on the command line qcow2 images now accept a boolean lazy_refcounts options. Use it like this: -drive file=test.qcow2,lazy_refcounts=on If the option is specified on the command line, it overrides the default specified by the qcow2 header flags that were set when creating the image. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- block/qcow2-cluster.c | 2 +- block/qcow2.c | 37 +++++++++++++++++++++++++++++++++++++ block/qcow2.h | 1 + 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 56fccf9487..ff9ae18e4a 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -668,7 +668,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } /* Update L2 table. */ - if (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS) { + if (s->use_lazy_refcounts) { qcow2_mark_dirty(bs); } if (qcow2_need_accurate_refcounts(s)) { diff --git a/block/qcow2.c b/block/qcow2.c index f5e4269363..ad43a13cb0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -285,11 +285,26 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, return ret; } +static QemuOptsList qcow2_runtime_opts = { + .name = "qcow2", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), + .desc = { + { + .name = "lazy_refcounts", + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + }, + { /* end of list */ } + }, +}; + static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) { BDRVQcowState *s = bs->opaque; int len, i, ret = 0; QCowHeader header; + QemuOpts *opts; + Error *local_err = NULL; uint64_t ext_end; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); @@ -495,6 +510,28 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) } } + /* Enable lazy_refcounts according to image and command line options */ + opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + s->use_lazy_refcounts = qemu_opt_get_bool(opts, "lazy_refcounts", + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + + qemu_opts_del(opts); + + if (s->use_lazy_refcounts && s->qcow_version < 3) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " + "a qcow2 image with at least qemu 1.1 compatibility level"); + ret = -EINVAL; + goto fail; + } + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; diff --git a/block/qcow2.h b/block/qcow2.h index 718b52baca..103abdb2c0 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -173,6 +173,7 @@ typedef struct BDRVQcowState { int flags; int qcow_version; + bool use_lazy_refcounts; uint64_t incompatible_features; uint64_t compatible_features; From 9991923b262dc35f6dd8393ab4853edd7fc3724f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:30 +0100 Subject: [PATCH 09/28] qcow2: flush refcount cache correctly in alloc_refcount_block() update_refcount() affects the refcount cache, it does not write to disk. Therefore bdrv_flush(bs->file) does nothing. We need to flush the refcount cache in order to write out the refcount updates! While we're here also add error returns when qcow2_cache_flush() fails. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 55543edf77..e8b5d0a4ac 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -201,7 +201,10 @@ static int alloc_refcount_block(BlockDriverState *bs, *refcount_block = NULL; /* We write to the refcount table, so we might depend on L2 tables */ - qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + return ret; + } /* Allocate the refcount block itself and mark it as used */ int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); @@ -237,7 +240,10 @@ static int alloc_refcount_block(BlockDriverState *bs, goto fail_block; } - bdrv_flush(bs->file); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } /* Initialize the new refcount block only after updating its refcount, * update_refcount uses the refcount cache itself */ From f6977f15561973d4a67b6aa46da88aa678c505dd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:31 +0100 Subject: [PATCH 10/28] qcow2: flush refcount cache correctly in qcow2_write_snapshots() Since qcow2 metadata is cached we need to flush the caches, not just the underlying file. Use bdrv_flush(bs) instead of bdrv_flush(bs->file). Also add the error return path when bdrv_flush() fails and move the flush after checking for qcow2_alloc_clusters() failure so that the qcow2_alloc_clusters() error return value takes precedence. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-snapshot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index eb8fcd5549..f4719d9909 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -180,11 +180,14 @@ static int qcow2_write_snapshots(BlockDriverState *bs) /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); - bdrv_flush(bs->file); offset = snapshots_offset; if (offset < 0) { return offset; } + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; i++) { From c1f5bafd70a8d9366570a3de472ef108415cd631 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:32 +0100 Subject: [PATCH 11/28] qcow2: set L2 cache dependency in qcow2_alloc_bytes() Compressed writes use qcow2_alloc_bytes() to allocate space with byte granularity. The affected clusters' refcounts will be incremented but we do not need to flush yet. Set a L2 cache dependency on the refcount block cache, so that the refcounts get written out before the L2 updates. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index e8b5d0a4ac..4d9df5fe95 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -669,7 +669,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } } - bdrv_flush(bs->file); + /* The cluster refcount was incremented, either by qcow2_alloc_clusters() + * or explicitly by update_cluster_refcount(). Refcount blocks must be + * flushed before the caller's L2 table updates. + */ + qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); return offset; } From 2154f24e4e3be4435f17fa4382032f95f19e2972 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:33 +0100 Subject: [PATCH 12/28] qcow2: flush in qcow2_update_snapshot_refcount() Users of qcow2_update_snapshot_refcount() do not flush consistently. qcow2_snapshot_create() flushes but qcow2_snapshot_goto() and qcow2_snapshot_delete() do not. Solve this by moving the bdrv_flush() into qcow2_update_snapshot_refcount(). Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 2 +- block/qcow2-snapshot.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 4d9df5fe95..3d29d30f03 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -851,7 +851,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } } - ret = 0; + ret = bdrv_flush(bs); fail: if (l2_table) { qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index f4719d9909..992a5c865c 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -381,11 +381,6 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; } - ret = bdrv_flush(bs); - if (ret < 0) { - goto fail; - } - /* Append the new snapshot to the snapshot list */ new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); if (s->snapshots) { From f9cb2860bd12e223036b61a1f5d1444e57b7efce Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:34 +0100 Subject: [PATCH 13/28] qcow2: drop flush in update_cluster_refcount() The update_cluster_refcount() function increments/decrements a cluster's refcount and then returns the new refcount value. There is no need to flush since both update_cluster_refcount() callers already take care of this: 1. qcow2_alloc_bytes() calls update_cluster_refcount() when compressed sectors will be appended to an existing cluster with enough free space. qcow2_alloc_bytes() already flushes so there is no need to do so in update_cluster_refcount(). 2. qcow2_update_snapshot_refcount() sets a cache dependency on refcounts if it needs to update L2 entries. It also flushes before completing. Removing this flush significantly speeds up qcow2 snapshot creation: $ qemu-img create -f qcow2 test.qcow2 -o size=50G,preallocation=metadata $ time qemu-img snapshot -c new test.qcow2 Time drops from more than 3 minutes to under 1 second. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 3d29d30f03..92519ea62a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -532,8 +532,6 @@ static int update_cluster_refcount(BlockDriverState *bs, return ret; } - bdrv_flush(bs->file); - return get_refcount(bs, cluster_index); } From 3647917919dfce5f731755f0ac5955a64bc12570 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 4 Mar 2013 15:02:35 +0100 Subject: [PATCH 14/28] qcow2: drop unnecessary flush in qcow2_update_snapshot_refcount() We already flush when the function completes. There is no need to flush after every compressed cluster. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 92519ea62a..9bfb390519 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -790,10 +790,6 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (ret < 0) { goto fail; } - - /* TODO Flushing once for the whole function should - * be enough */ - bdrv_flush(bs->file); } /* compressed clusters are never modified */ refcount = 2; From 381b487d54ba18c73df9db8452028a330058c505 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 6 Mar 2013 18:02:01 +0100 Subject: [PATCH 15/28] qcow2: make is_allocated return true for zero clusters Otherwise, live migration of the top layer will miss zero clusters and let the backing file show through. This also matches what is done in qed. QCOW2_CLUSTER_ZERO clusters are invalid in v2 image files. Check this directly in qcow2_get_cluster_offset instead of replicating the test everywhere. Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block/qcow2-cluster.c | 3 +++ block/qcow2.c | 6 +----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ff9ae18e4a..d72d063e6d 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -454,6 +454,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; break; case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + return -EIO; + } c = count_contiguous_clusters(nb_clusters, s->cluster_size, &l2_table[l2_index], 0, QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); diff --git a/block/qcow2.c b/block/qcow2.c index ad43a13cb0..1f99866cf4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -621,7 +621,7 @@ static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, *pnum = 0; } - return (cluster_offset != 0); + return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); } /* handle reading after the end of the backing file */ @@ -702,10 +702,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, break; case QCOW2_CLUSTER_ZERO: - if (s->qcow_version < 3) { - ret = -EIO; - goto fail; - } qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); break; From 0d6db300cd95339cec6b8ab2ef46bd7cdb040959 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Tue, 12 Mar 2013 16:05:42 +0900 Subject: [PATCH 16/28] sheepdog: use non-blocking fd in coroutine context Using a blocking socket in the coroutine context reduces the chance of switching to other work. This patch makes the sheepdog driver use a non-blocking fd always. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index c711c28613..27abef2129 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -468,6 +468,8 @@ static int connect_to_sdog(BDRVSheepdogState *s) if (err != NULL) { qerror_report_err(err); error_free(err); + } else { + socket_set_nonblock(fd); } return fd; @@ -523,7 +525,6 @@ static coroutine_fn void do_co_req(void *opaque) co = qemu_coroutine_self(); qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co); - socket_set_block(sockfd); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; @@ -553,7 +554,6 @@ static coroutine_fn void do_co_req(void *opaque) ret = 0; out: qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); - socket_set_nonblock(sockfd); srco->ret = ret; srco->finished = true; @@ -776,8 +776,6 @@ static int get_sheep_fd(BDRVSheepdogState *s) return fd; } - socket_set_nonblock(fd); - qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); return fd; } From ed9ba7246758e3250849bfa0a055b4eb98fc5152 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Tue, 12 Mar 2013 16:05:43 +0900 Subject: [PATCH 17/28] sheepdog: set io_flush handler in do_co_req If an io_flush handler is not set, qemu_aio_wait doesn't invoke callbacks. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 27abef2129..4245328569 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -501,6 +501,13 @@ static void restart_co_req(void *opaque) qemu_coroutine_enter(co, NULL); } +static int have_co_req(void *opaque) +{ + /* this handler is set only when there is a pending request, so + * always returns 1. */ + return 1; +} + typedef struct SheepdogReqCo { int sockfd; SheepdogReq *hdr; @@ -523,14 +530,14 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co); + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); if (ret < sizeof(*hdr)) { @@ -553,6 +560,8 @@ static coroutine_fn void do_co_req(void *opaque) } ret = 0; out: + /* there is at most one request for this sockfd, so it is safe to + * set each handler to NULL. */ qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); srco->ret = ret; From 5f3aa1ff4781f39e05b9892d58319a09fedc8918 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:44 +0100 Subject: [PATCH 18/28] main-loop: add qemu_get_aio_context() It is very useful to get the main loop AioContext, which is a static variable in main-loop.c. I'm not sure whether qemu_get_aio_context() will be necessary in the future once devices focus on using their own AioContext instead of the main loop AioContext, but for now it allows us to refactor code to support multiple AioContext while actually passing the main loop AioContext. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- include/qemu/main-loop.h | 5 +++++ main-loop.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 09952885a9..6f0200a7ac 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -81,6 +81,11 @@ int qemu_init_main_loop(void); */ int main_loop_wait(int nonblocking); +/** + * qemu_get_aio_context: Return the main loop's AioContext + */ +AioContext *qemu_get_aio_context(void); + /** * qemu_notify_event: Force processing of pending events. * diff --git a/main-loop.c b/main-loop.c index 8c9b58c14c..eb80ff369f 100644 --- a/main-loop.c +++ b/main-loop.c @@ -109,6 +109,11 @@ static int qemu_signal_init(void) static AioContext *qemu_aio_context; +AioContext *qemu_get_aio_context(void) +{ + return qemu_aio_context; +} + void qemu_notify_event(void) { if (!qemu_aio_context) { From b811203cf2fbf83e26f8e8feb2c77784259a4cbd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:45 +0100 Subject: [PATCH 19/28] threadpool: move globals into struct ThreadPool Move global variables into a struct so multiple thread pools can be supported in the future. This patch does not change thread-pool.h interfaces. There is still a global thread pool and it is not yet possible to create/destroy individual thread pools. Moving the variables into a struct first makes later patches easier to review. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- thread-pool.c | 184 +++++++++++++++++++++++++++++--------------------- trace-events | 4 +- 2 files changed, 109 insertions(+), 79 deletions(-) diff --git a/thread-pool.c b/thread-pool.c index e3ca64d790..a0aecd08fe 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -24,7 +24,9 @@ #include "qemu/event_notifier.h" #include "block/thread-pool.h" -static void do_spawn_thread(void); +typedef struct ThreadPool ThreadPool; + +static void do_spawn_thread(ThreadPool *pool); typedef struct ThreadPoolElement ThreadPoolElement; @@ -37,6 +39,7 @@ enum ThreadState { struct ThreadPoolElement { BlockDriverAIOCB common; + ThreadPool *pool; ThreadPoolFunc *func; void *arg; @@ -54,49 +57,56 @@ struct ThreadPoolElement { QLIST_ENTRY(ThreadPoolElement) all; }; -static EventNotifier notifier; -static QemuMutex lock; -static QemuCond check_cancel; -static QemuSemaphore sem; -static int max_threads = 64; -static QEMUBH *new_thread_bh; +struct ThreadPool { + EventNotifier notifier; + QemuMutex lock; + QemuCond check_cancel; + QemuSemaphore sem; + int max_threads; + QEMUBH *new_thread_bh; -/* The following variables are protected by the global mutex. */ -static QLIST_HEAD(, ThreadPoolElement) head; + /* The following variables are only accessed from one AioContext. */ + QLIST_HEAD(, ThreadPoolElement) head; -/* The following variables are protected by lock. */ -static QTAILQ_HEAD(, ThreadPoolElement) request_list; -static int cur_threads; -static int idle_threads; -static int new_threads; /* backlog of threads we need to create */ -static int pending_threads; /* threads created but not running yet */ -static int pending_cancellations; /* whether we need a cond_broadcast */ + /* The following variables are protected by lock. */ + QTAILQ_HEAD(, ThreadPoolElement) request_list; + int cur_threads; + int idle_threads; + int new_threads; /* backlog of threads we need to create */ + int pending_threads; /* threads created but not running yet */ + int pending_cancellations; /* whether we need a cond_broadcast */ +}; -static void *worker_thread(void *unused) +/* Currently there is only one thread pool instance. */ +static ThreadPool global_pool; + +static void *worker_thread(void *opaque) { - qemu_mutex_lock(&lock); - pending_threads--; - do_spawn_thread(); + ThreadPool *pool = opaque; + + qemu_mutex_lock(&pool->lock); + pool->pending_threads--; + do_spawn_thread(pool); while (1) { ThreadPoolElement *req; int ret; do { - idle_threads++; - qemu_mutex_unlock(&lock); - ret = qemu_sem_timedwait(&sem, 10000); - qemu_mutex_lock(&lock); - idle_threads--; - } while (ret == -1 && !QTAILQ_EMPTY(&request_list)); + pool->idle_threads++; + qemu_mutex_unlock(&pool->lock); + ret = qemu_sem_timedwait(&pool->sem, 10000); + qemu_mutex_lock(&pool->lock); + pool->idle_threads--; + } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); if (ret == -1) { break; } - req = QTAILQ_FIRST(&request_list); - QTAILQ_REMOVE(&request_list, req, reqs); + req = QTAILQ_FIRST(&pool->request_list); + QTAILQ_REMOVE(&pool->request_list, req, reqs); req->state = THREAD_ACTIVE; - qemu_mutex_unlock(&lock); + qemu_mutex_unlock(&pool->lock); ret = req->func(req->arg); @@ -105,45 +115,47 @@ static void *worker_thread(void *unused) smp_wmb(); req->state = THREAD_DONE; - qemu_mutex_lock(&lock); - if (pending_cancellations) { - qemu_cond_broadcast(&check_cancel); + qemu_mutex_lock(&pool->lock); + if (pool->pending_cancellations) { + qemu_cond_broadcast(&pool->check_cancel); } - event_notifier_set(¬ifier); + event_notifier_set(&pool->notifier); } - cur_threads--; - qemu_mutex_unlock(&lock); + pool->cur_threads--; + qemu_mutex_unlock(&pool->lock); return NULL; } -static void do_spawn_thread(void) +static void do_spawn_thread(ThreadPool *pool) { QemuThread t; /* Runs with lock taken. */ - if (!new_threads) { + if (!pool->new_threads) { return; } - new_threads--; - pending_threads++; + pool->new_threads--; + pool->pending_threads++; - qemu_thread_create(&t, worker_thread, NULL, QEMU_THREAD_DETACHED); + qemu_thread_create(&t, worker_thread, pool, QEMU_THREAD_DETACHED); } static void spawn_thread_bh_fn(void *opaque) { - qemu_mutex_lock(&lock); - do_spawn_thread(); - qemu_mutex_unlock(&lock); + ThreadPool *pool = opaque; + + qemu_mutex_lock(&pool->lock); + do_spawn_thread(pool); + qemu_mutex_unlock(&pool->lock); } -static void spawn_thread(void) +static void spawn_thread(ThreadPool *pool) { - cur_threads++; - new_threads++; + pool->cur_threads++; + pool->new_threads++; /* If there are threads being created, they will spawn new workers, so * we don't spend time creating many threads in a loop holding a mutex or * starving the current vcpu. @@ -151,23 +163,25 @@ static void spawn_thread(void) * If there are no idle threads, ask the main thread to create one, so we * inherit the correct affinity instead of the vcpu affinity. */ - if (!pending_threads) { - qemu_bh_schedule(new_thread_bh); + if (!pool->pending_threads) { + qemu_bh_schedule(pool->new_thread_bh); } } static void event_notifier_ready(EventNotifier *notifier) { + ThreadPool *pool = container_of(notifier, ThreadPool, notifier); ThreadPoolElement *elem, *next; event_notifier_test_and_clear(notifier); restart: - QLIST_FOREACH_SAFE(elem, &head, all, next) { + QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { if (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { continue; } if (elem->state == THREAD_DONE) { - trace_thread_pool_complete(elem, elem->common.opaque, elem->ret); + trace_thread_pool_complete(pool, elem, elem->common.opaque, + elem->ret); } if (elem->state == THREAD_DONE && elem->common.cb) { QLIST_REMOVE(elem, all); @@ -186,34 +200,36 @@ restart: static int thread_pool_active(EventNotifier *notifier) { - return !QLIST_EMPTY(&head); + ThreadPool *pool = container_of(notifier, ThreadPool, notifier); + return !QLIST_EMPTY(&pool->head); } static void thread_pool_cancel(BlockDriverAIOCB *acb) { ThreadPoolElement *elem = (ThreadPoolElement *)acb; + ThreadPool *pool = elem->pool; trace_thread_pool_cancel(elem, elem->common.opaque); - qemu_mutex_lock(&lock); + qemu_mutex_lock(&pool->lock); if (elem->state == THREAD_QUEUED && /* No thread has yet started working on elem. we can try to "steal" * the item from the worker if we can get a signal from the * semaphore. Because this is non-blocking, we can do it with * the lock taken and ensure that elem will remain THREAD_QUEUED. */ - qemu_sem_timedwait(&sem, 0) == 0) { - QTAILQ_REMOVE(&request_list, elem, reqs); + qemu_sem_timedwait(&pool->sem, 0) == 0) { + QTAILQ_REMOVE(&pool->request_list, elem, reqs); elem->state = THREAD_CANCELED; - event_notifier_set(¬ifier); + event_notifier_set(&pool->notifier); } else { - pending_cancellations++; + pool->pending_cancellations++; while (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { - qemu_cond_wait(&check_cancel, &lock); + qemu_cond_wait(&pool->check_cancel, &pool->lock); } - pending_cancellations--; + pool->pending_cancellations--; } - qemu_mutex_unlock(&lock); + qemu_mutex_unlock(&pool->lock); } static const AIOCBInfo thread_pool_aiocb_info = { @@ -224,24 +240,26 @@ static const AIOCBInfo thread_pool_aiocb_info = { BlockDriverAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockDriverCompletionFunc *cb, void *opaque) { + ThreadPool *pool = &global_pool; ThreadPoolElement *req; req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque); req->func = func; req->arg = arg; req->state = THREAD_QUEUED; + req->pool = pool; - QLIST_INSERT_HEAD(&head, req, all); + QLIST_INSERT_HEAD(&pool->head, req, all); - trace_thread_pool_submit(req, arg); + trace_thread_pool_submit(pool, req, arg); - qemu_mutex_lock(&lock); - if (idle_threads == 0 && cur_threads < max_threads) { - spawn_thread(); + qemu_mutex_lock(&pool->lock); + if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) { + spawn_thread(pool); } - QTAILQ_INSERT_TAIL(&request_list, req, reqs); - qemu_mutex_unlock(&lock); - qemu_sem_post(&sem); + QTAILQ_INSERT_TAIL(&pool->request_list, req, reqs); + qemu_mutex_unlock(&pool->lock); + qemu_sem_post(&pool->sem); return &req->common; } @@ -272,18 +290,30 @@ void thread_pool_submit(ThreadPoolFunc *func, void *arg) thread_pool_submit_aio(func, arg, NULL, NULL); } +static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) +{ + if (!ctx) { + ctx = qemu_get_aio_context(); + } + + memset(pool, 0, sizeof(*pool)); + event_notifier_init(&pool->notifier, false); + qemu_mutex_init(&pool->lock); + qemu_cond_init(&pool->check_cancel); + qemu_sem_init(&pool->sem, 0); + pool->max_threads = 64; + pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); + + QLIST_INIT(&pool->head); + QTAILQ_INIT(&pool->request_list); + + aio_set_event_notifier(ctx, &pool->notifier, event_notifier_ready, + thread_pool_active); +} + static void thread_pool_init(void) { - QLIST_INIT(&head); - event_notifier_init(¬ifier, false); - qemu_mutex_init(&lock); - qemu_cond_init(&check_cancel); - qemu_sem_init(&sem, 0); - qemu_aio_set_event_notifier(¬ifier, event_notifier_ready, - thread_pool_active); - - QTAILQ_INIT(&request_list); - new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL); + thread_pool_init_one(&global_pool, NULL); } block_init(thread_pool_init) diff --git a/trace-events b/trace-events index d6a847d18a..cd73b7f3ea 100644 --- a/trace-events +++ b/trace-events @@ -115,8 +115,8 @@ virtio_blk_data_plane_complete_request(void *s, unsigned int head, int ret) "dat vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring physical %#"PRIx64" desc %p avail %p used %p" # thread-pool.c -thread_pool_submit(void *req, void *opaque) "req %p opaque %p" -thread_pool_complete(void *req, void *opaque, int ret) "req %p opaque %p ret %d" +thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" +thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" # posix-aio-compat.c From f7311ccc630d925e7351e9440b7ad8bc6f0a51de Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:46 +0100 Subject: [PATCH 20/28] threadpool: add thread_pool_new() and thread_pool_free() ThreadPool is tied to an AioContext through its event notifier, which dictates in which AioContext the work item's callback function will be invoked. In order to support multiple AioContexts we need to support multiple ThreadPool instances. This patch adds the new/free functions. The free function deserves special attention because it quiesces remaining worker threads. This requires a new condition variable and a "stopping" flag to let workers know they should terminate once idle. We never needed to do this before since the global threadpool was not explicitly destroyed until process termination. Also stash the AioContext pointer in ThreadPool so that we can call aio_set_event_notifier() in thread_pool_free(). We didn't need to hold onto AioContext previously since there was no free function. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- include/block/thread-pool.h | 5 ++++ thread-pool.c | 52 ++++++++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 200703e35f..e1453c685d 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -26,6 +26,11 @@ typedef int ThreadPoolFunc(void *opaque); +typedef struct ThreadPool ThreadPool; + +ThreadPool *thread_pool_new(struct AioContext *ctx); +void thread_pool_free(ThreadPool *pool); + BlockDriverAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockDriverCompletionFunc *cb, void *opaque); int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); diff --git a/thread-pool.c b/thread-pool.c index a0aecd08fe..d1e4570829 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -24,8 +24,6 @@ #include "qemu/event_notifier.h" #include "block/thread-pool.h" -typedef struct ThreadPool ThreadPool; - static void do_spawn_thread(ThreadPool *pool); typedef struct ThreadPoolElement ThreadPoolElement; @@ -59,8 +57,10 @@ struct ThreadPoolElement { struct ThreadPool { EventNotifier notifier; + AioContext *ctx; QemuMutex lock; QemuCond check_cancel; + QemuCond worker_stopped; QemuSemaphore sem; int max_threads; QEMUBH *new_thread_bh; @@ -75,6 +75,7 @@ struct ThreadPool { int new_threads; /* backlog of threads we need to create */ int pending_threads; /* threads created but not running yet */ int pending_cancellations; /* whether we need a cond_broadcast */ + bool stopping; }; /* Currently there is only one thread pool instance. */ @@ -88,7 +89,7 @@ static void *worker_thread(void *opaque) pool->pending_threads--; do_spawn_thread(pool); - while (1) { + while (!pool->stopping) { ThreadPoolElement *req; int ret; @@ -99,7 +100,7 @@ static void *worker_thread(void *opaque) qemu_mutex_lock(&pool->lock); pool->idle_threads--; } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); - if (ret == -1) { + if (ret == -1 || pool->stopping) { break; } @@ -124,6 +125,7 @@ static void *worker_thread(void *opaque) } pool->cur_threads--; + qemu_cond_signal(&pool->worker_stopped); qemu_mutex_unlock(&pool->lock); return NULL; } @@ -298,8 +300,10 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) memset(pool, 0, sizeof(*pool)); event_notifier_init(&pool->notifier, false); + pool->ctx = ctx; qemu_mutex_init(&pool->lock); qemu_cond_init(&pool->check_cancel); + qemu_cond_init(&pool->worker_stopped); qemu_sem_init(&pool->sem, 0); pool->max_threads = 64; pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); @@ -311,6 +315,46 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) thread_pool_active); } +ThreadPool *thread_pool_new(AioContext *ctx) +{ + ThreadPool *pool = g_new(ThreadPool, 1); + thread_pool_init_one(pool, ctx); + return pool; +} + +void thread_pool_free(ThreadPool *pool) +{ + if (!pool) { + return; + } + + assert(QLIST_EMPTY(&pool->head)); + + qemu_mutex_lock(&pool->lock); + + /* Stop new threads from spawning */ + qemu_bh_delete(pool->new_thread_bh); + pool->cur_threads -= pool->new_threads; + pool->new_threads = 0; + + /* Wait for worker threads to terminate */ + pool->stopping = true; + while (pool->cur_threads > 0) { + qemu_sem_post(&pool->sem); + qemu_cond_wait(&pool->worker_stopped, &pool->lock); + } + + qemu_mutex_unlock(&pool->lock); + + aio_set_event_notifier(pool->ctx, &pool->notifier, NULL, NULL); + qemu_sem_destroy(&pool->sem); + qemu_cond_destroy(&pool->check_cancel); + qemu_cond_destroy(&pool->worker_stopped); + qemu_mutex_destroy(&pool->lock); + event_notifier_cleanup(&pool->notifier); + g_free(pool); +} + static void thread_pool_init(void) { thread_pool_init_one(&global_pool, NULL); From 9b34277d23a6fb15eb9513006c96d8026beeea1f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:47 +0100 Subject: [PATCH 21/28] aio: add a ThreadPool instance to AioContext This patch adds a ThreadPool to AioContext. It's possible that some AioContext instances will never use the ThreadPool, so defer creation until aio_get_thread_pool(). The reason why AioContext should have the ThreadPool is because the ThreadPool is bound to a AioContext instance where the work item's callback function is invoked. It doesn't make sense to keep the ThreadPool pointer anywhere other than AioContext. For example, block/raw-posix.c can get its AioContext's ThreadPool and submit work. Special note about headers: I used struct ThreadPool in aio.h because there is a circular dependency if aio.h includes thread-pool.h. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- async.c | 11 +++++++++++ include/block/aio.h | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/async.c b/async.c index f2d47ba96d..90fe906539 100644 --- a/async.c +++ b/async.c @@ -24,6 +24,7 @@ #include "qemu-common.h" #include "block/aio.h" +#include "block/thread-pool.h" #include "qemu/main-loop.h" /***********************************************************/ @@ -172,6 +173,7 @@ aio_ctx_finalize(GSource *source) { AioContext *ctx = (AioContext *) source; + thread_pool_free(ctx->thread_pool); aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL); event_notifier_cleanup(&ctx->notifier); g_array_free(ctx->pollfds, TRUE); @@ -190,6 +192,14 @@ GSource *aio_get_g_source(AioContext *ctx) return &ctx->source; } +ThreadPool *aio_get_thread_pool(AioContext *ctx) +{ + if (!ctx->thread_pool) { + ctx->thread_pool = thread_pool_new(ctx); + } + return ctx->thread_pool; +} + void aio_notify(AioContext *ctx) { event_notifier_set(&ctx->notifier); @@ -200,6 +210,7 @@ AioContext *aio_context_new(void) AioContext *ctx; ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext)); ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); + ctx->thread_pool = NULL; event_notifier_init(&ctx->notifier, false); aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) diff --git a/include/block/aio.h b/include/block/aio.h index 5b54d383fc..183679374f 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -66,6 +66,9 @@ typedef struct AioContext { /* GPollFDs for aio_poll() */ GArray *pollfds; + + /* Thread pool for performing work and receiving completion callbacks */ + struct ThreadPool *thread_pool; } AioContext; /* Returns 1 if there are still outstanding AIO requests; 0 otherwise */ @@ -223,6 +226,9 @@ void aio_set_event_notifier(AioContext *ctx, */ GSource *aio_get_g_source(AioContext *ctx); +/* Return the ThreadPool bound to this AioContext */ +struct ThreadPool *aio_get_thread_pool(AioContext *ctx); + /* Functions to operate on the main QEMU AioContext. */ bool qemu_aio_wait(void); From 85d126f3ee666702ac514c66606c62d276c4341c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:48 +0100 Subject: [PATCH 22/28] block: add bdrv_get_aio_context() For now bdrv_get_aio_context() is just a stub that calls qemu_aio_get_context() since the block layer is currently tied to the main loop AioContext. Add the stub now so that the block layer can begin accessing its AioContext. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- block.c | 6 ++++++ include/block/block_int.h | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/block.c b/block.c index 0ab164d992..037e15e065 100644 --- a/block.c +++ b/block.c @@ -4681,3 +4681,9 @@ out: bdrv_delete(bs); } } + +AioContext *bdrv_get_aio_context(BlockDriverState *bs) +{ + /* Currently BlockDriverState always uses the main loop AioContext */ + return qemu_get_aio_context(); +} diff --git a/include/block/block_int.h b/include/block/block_int.h index baf80e3c42..ce0aa26b8e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -294,6 +294,13 @@ int get_tmp_filename(char *filename, int size); void bdrv_set_io_limits(BlockDriverState *bs, BlockIOLimit *io_limits); +/** + * bdrv_get_aio_context: + * + * Returns: the currently bound #AioContext + */ +AioContext *bdrv_get_aio_context(BlockDriverState *bs); + #ifdef _WIN32 int is_windows_drive(const char *filename); #endif From c4d9d19645a484298a67e9021060bc7c2b081d0f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:49 +0100 Subject: [PATCH 23/28] threadpool: drop global thread pool Now that each AioContext has a ThreadPool and the main loop AioContext can be fetched with bdrv_get_aio_context(), we can eliminate the concept of a global thread pool from thread-pool.c. The submit functions must take a ThreadPool* argument. block/raw-posix.c and block/raw-win32.c use aio_get_thread_pool(bdrv_get_aio_context(bs)) to fetch the main loop's ThreadPool. tests/test-thread-pool.c must be updated to reflect the new thread_pool_submit() function prototypes. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- block/raw-posix.c | 8 +++++-- block/raw-win32.c | 4 +++- include/block/thread-pool.h | 10 +++++---- tests/test-thread-pool.c | 44 ++++++++++++++++++------------------- thread-pool.c | 23 ++++++------------- 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 4dfdf985b0..8a3cdbc1f3 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -750,6 +750,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, BlockDriverCompletionFunc *cb, void *opaque, int type) { RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + ThreadPool *pool; acb->bs = bs; acb->aio_type = type; @@ -763,7 +764,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_offset = sector_num * 512; trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, @@ -1413,6 +1415,7 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, { BDRVRawState *s = bs->opaque; RawPosixAIOData *acb; + ThreadPool *pool; if (fd_open(bs) < 0) return NULL; @@ -1424,7 +1427,8 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, acb->aio_offset = 0; acb->aio_ioctl_buf = buf; acb->aio_ioctl_cmd = req; - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/block/raw-win32.c b/block/raw-win32.c index b89ac19ffa..18e0068b26 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -144,6 +144,7 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, BlockDriverCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_slice_new(RawWin32AIOData); + ThreadPool *pool; acb->bs = bs; acb->hfile = hfile; @@ -157,7 +158,8 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, acb->aio_offset = sector_num * 512; trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } int qemu_ftruncate64(int fd, int64_t length) diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index e1453c685d..32afcdd1d6 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -31,9 +31,11 @@ typedef struct ThreadPool ThreadPool; ThreadPool *thread_pool_new(struct AioContext *ctx); void thread_pool_free(ThreadPool *pool); -BlockDriverAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, - BlockDriverCompletionFunc *cb, void *opaque); -int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); -void thread_pool_submit(ThreadPoolFunc *func, void *arg); +BlockDriverAIOCB *thread_pool_submit_aio(ThreadPool *pool, + ThreadPoolFunc *func, void *arg, + BlockDriverCompletionFunc *cb, void *opaque); +int coroutine_fn thread_pool_submit_co(ThreadPool *pool, + ThreadPoolFunc *func, void *arg); +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg); #endif diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c index 9998e031f2..22915aac10 100644 --- a/tests/test-thread-pool.c +++ b/tests/test-thread-pool.c @@ -4,6 +4,8 @@ #include "block/thread-pool.h" #include "block/block.h" +static AioContext *ctx; +static ThreadPool *pool; static int active; typedef struct { @@ -38,19 +40,10 @@ static void done_cb(void *opaque, int ret) active--; } -/* A non-blocking poll of the main AIO context (we cannot use aio_poll - * because we do not know the AioContext). - */ -static void qemu_aio_wait_nonblocking(void) -{ - qemu_notify_event(); - qemu_aio_wait(); -} - /* Wait until all aio and bh activity has finished */ static void qemu_aio_wait_all(void) { - while (qemu_aio_wait()) { + while (aio_poll(ctx, true)) { /* Do nothing */ } } @@ -58,7 +51,7 @@ static void qemu_aio_wait_all(void) static void test_submit(void) { WorkerTestData data = { .n = 0 }; - thread_pool_submit(worker_cb, &data); + thread_pool_submit(pool, worker_cb, &data); qemu_aio_wait_all(); g_assert_cmpint(data.n, ==, 1); } @@ -66,7 +59,8 @@ static void test_submit(void) static void test_submit_aio(void) { WorkerTestData data = { .n = 0, .ret = -EINPROGRESS }; - data.aiocb = thread_pool_submit_aio(worker_cb, &data, done_cb, &data); + data.aiocb = thread_pool_submit_aio(pool, worker_cb, &data, + done_cb, &data); /* The callbacks are not called until after the first wait. */ active = 1; @@ -84,7 +78,7 @@ static void co_test_cb(void *opaque) active = 1; data->n = 0; data->ret = -EINPROGRESS; - thread_pool_submit_co(worker_cb, data); + thread_pool_submit_co(pool, worker_cb, data); /* The test continues in test_submit_co, after qemu_coroutine_enter... */ @@ -126,12 +120,12 @@ static void test_submit_many(void) for (i = 0; i < 100; i++) { data[i].n = 0; data[i].ret = -EINPROGRESS; - thread_pool_submit_aio(worker_cb, &data[i], done_cb, &data[i]); + thread_pool_submit_aio(pool, worker_cb, &data[i], done_cb, &data[i]); } active = 100; while (active > 0) { - qemu_aio_wait(); + aio_poll(ctx, true); } for (i = 0; i < 100; i++) { g_assert_cmpint(data[i].n, ==, 1); @@ -154,7 +148,7 @@ static void test_cancel(void) for (i = 0; i < 100; i++) { data[i].n = 0; data[i].ret = -EINPROGRESS; - data[i].aiocb = thread_pool_submit_aio(long_cb, &data[i], + data[i].aiocb = thread_pool_submit_aio(pool, long_cb, &data[i], done_cb, &data[i]); } @@ -162,7 +156,8 @@ static void test_cancel(void) * run, but do not waste too much time... */ active = 100; - qemu_aio_wait_nonblocking(); + aio_notify(ctx); + aio_poll(ctx, false); /* Wait some time for the threads to start, with some sanity * testing on the behavior of the scheduler... @@ -208,11 +203,10 @@ static void test_cancel(void) int main(int argc, char **argv) { - /* These should be removed once each AioContext has its thread pool. - * The test should create its own AioContext. - */ - qemu_init_main_loop(); - bdrv_init(); + int ret; + + ctx = aio_context_new(); + pool = aio_get_thread_pool(ctx); g_test_init(&argc, &argv, NULL); g_test_add_func("/thread-pool/submit", test_submit); @@ -220,5 +214,9 @@ int main(int argc, char **argv) g_test_add_func("/thread-pool/submit-co", test_submit_co); g_test_add_func("/thread-pool/submit-many", test_submit_many); g_test_add_func("/thread-pool/cancel", test_cancel); - return g_test_run(); + + ret = g_test_run(); + + aio_context_unref(ctx); + return ret; } diff --git a/thread-pool.c b/thread-pool.c index d1e4570829..0ebd4c2964 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -78,9 +78,6 @@ struct ThreadPool { bool stopping; }; -/* Currently there is only one thread pool instance. */ -static ThreadPool global_pool; - static void *worker_thread(void *opaque) { ThreadPool *pool = opaque; @@ -239,10 +236,10 @@ static const AIOCBInfo thread_pool_aiocb_info = { .cancel = thread_pool_cancel, }; -BlockDriverAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, +BlockDriverAIOCB *thread_pool_submit_aio(ThreadPool *pool, + ThreadPoolFunc *func, void *arg, BlockDriverCompletionFunc *cb, void *opaque) { - ThreadPool *pool = &global_pool; ThreadPoolElement *req; req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque); @@ -278,18 +275,19 @@ static void thread_pool_co_cb(void *opaque, int ret) qemu_coroutine_enter(co->co, NULL); } -int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg) +int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func, + void *arg) { ThreadPoolCo tpc = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS }; assert(qemu_in_coroutine()); - thread_pool_submit_aio(func, arg, thread_pool_co_cb, &tpc); + thread_pool_submit_aio(pool, func, arg, thread_pool_co_cb, &tpc); qemu_coroutine_yield(); return tpc.ret; } -void thread_pool_submit(ThreadPoolFunc *func, void *arg) +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) { - thread_pool_submit_aio(func, arg, NULL, NULL); + thread_pool_submit_aio(pool, func, arg, NULL, NULL); } static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) @@ -354,10 +352,3 @@ void thread_pool_free(ThreadPool *pool) event_notifier_cleanup(&pool->notifier); g_free(pool); } - -static void thread_pool_init(void) -{ - thread_pool_init_one(&global_pool, NULL); -} - -block_init(thread_pool_init) From 28f082469650a0f4c0e37b4ccd6f9514b1a0698d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 7 Mar 2013 13:41:50 +0100 Subject: [PATCH 24/28] coroutine: use AioContext for CoQueue BH CoQueue uses a BH to awake coroutines that were made ready to run again using qemu_co_queue_next() or qemu_co_queue_restart_all(). The BH currently runs in the iothread AioContext and would break coroutines that run in a different AioContext. This is a slightly tricky problem because the lifetime of the BH exceeds that of the CoQueue. This means coroutines can be awoken after CoQueue itself has been freed. Also, there is no qemu_co_queue_destroy() function which we could use to handle freeing resources. Introducing qemu_co_queue_destroy() has a ripple effect of requiring us to also add qemu_co_mutex_destroy() and qemu_co_rwlock_destroy(), as well as updating all callers. Avoid doing that. We also cannot switch from BH to GIdle function because aio_poll() does not dispatch GIdle functions. (GIdle functions make memory management slightly easier because they free themselves.) Finally, I don't want to move unlock_queue and unlock_bh into AioContext. That would break encapsulation - AioContext isn't supposed to know about CoQueue. This patch implements a different solution: each qemu_co_queue_next() or qemu_co_queue_restart_all() call creates a new BH and list of coroutines to wake up. Callers tend to invoke qemu_co_queue_next() and qemu_co_queue_restart_all() occasionally after blocking I/O, so creating a new BH for each call shouldn't be massively inefficient. Note that this patch does not add an interface for specifying the AioContext. That is left to future patches which will convert CoQueue, CoMutex, and CoRwlock to expose AioContext. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini --- include/block/coroutine.h | 1 + qemu-coroutine-lock.c | 59 ++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/block/coroutine.h b/include/block/coroutine.h index c31fae3f3c..a978162a3f 100644 --- a/include/block/coroutine.h +++ b/include/block/coroutine.h @@ -104,6 +104,7 @@ bool qemu_in_coroutine(void); */ typedef struct CoQueue { QTAILQ_HEAD(, Coroutine) entries; + AioContext *ctx; } CoQueue; /** diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index 97ef01c796..86efe1f86a 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -29,28 +29,36 @@ #include "block/aio.h" #include "trace.h" -static QTAILQ_HEAD(, Coroutine) unlock_bh_queue = - QTAILQ_HEAD_INITIALIZER(unlock_bh_queue); -static QEMUBH* unlock_bh; +/* Coroutines are awoken from a BH to allow the current coroutine to complete + * its flow of execution. The BH may run after the CoQueue has been destroyed, + * so keep BH data in a separate heap-allocated struct. + */ +typedef struct { + QEMUBH *bh; + QTAILQ_HEAD(, Coroutine) entries; +} CoQueueNextData; static void qemu_co_queue_next_bh(void *opaque) { + CoQueueNextData *data = opaque; Coroutine *next; trace_qemu_co_queue_next_bh(); - while ((next = QTAILQ_FIRST(&unlock_bh_queue))) { - QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next); + while ((next = QTAILQ_FIRST(&data->entries))) { + QTAILQ_REMOVE(&data->entries, next, co_queue_next); qemu_coroutine_enter(next, NULL); } + + qemu_bh_delete(data->bh); + g_slice_free(CoQueueNextData, data); } void qemu_co_queue_init(CoQueue *queue) { QTAILQ_INIT(&queue->entries); - if (!unlock_bh) { - unlock_bh = qemu_bh_new(qemu_co_queue_next_bh, NULL); - } + /* This will be exposed to callers once there are multiple AioContexts */ + queue->ctx = qemu_get_aio_context(); } void coroutine_fn qemu_co_queue_wait(CoQueue *queue) @@ -69,26 +77,39 @@ void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue) assert(qemu_in_coroutine()); } -bool qemu_co_queue_next(CoQueue *queue) +static bool qemu_co_queue_do_restart(CoQueue *queue, bool single) { Coroutine *next; + CoQueueNextData *data; - next = QTAILQ_FIRST(&queue->entries); - if (next) { - QTAILQ_REMOVE(&queue->entries, next, co_queue_next); - QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next); - trace_qemu_co_queue_next(next); - qemu_bh_schedule(unlock_bh); + if (QTAILQ_EMPTY(&queue->entries)) { + return false; } - return (next != NULL); + data = g_slice_new(CoQueueNextData); + data->bh = aio_bh_new(queue->ctx, qemu_co_queue_next_bh, data); + QTAILQ_INIT(&data->entries); + qemu_bh_schedule(data->bh); + + while ((next = QTAILQ_FIRST(&queue->entries)) != NULL) { + QTAILQ_REMOVE(&queue->entries, next, co_queue_next); + QTAILQ_INSERT_TAIL(&data->entries, next, co_queue_next); + trace_qemu_co_queue_next(next); + if (single) { + break; + } + } + return true; +} + +bool qemu_co_queue_next(CoQueue *queue) +{ + return qemu_co_queue_do_restart(queue, true); } void qemu_co_queue_restart_all(CoQueue *queue) { - while (qemu_co_queue_next(queue)) { - /* Do nothing */ - } + qemu_co_queue_do_restart(queue, false); } bool qemu_co_queue_empty(CoQueue *queue) From 485e3ce88e8af0e5311dd626f399b3a1e0846dd6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Mar 2013 15:14:15 +0100 Subject: [PATCH 25/28] dataplane: fix hang introduced by AioContext transition The bug is that the EventNotifiers do have a NULL io_flush callback. Because _none_ of the callbacks on the dataplane AioContext have such a callback, aio_poll will simply do nothing. Fixed by adding the callbacks: the ioeventfd will always be polled (this can change in the future to pause/resume the processing during live snapshots or similar operations); the ioqueue will be polled if there are outstanding requests. I must admit I have screwed up my testing somehow, because commit 2c20e71 does not work even if cherry-picked on top of 1.4.0, and this patch fixes it there as well. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- hw/dataplane/virtio-blk.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c index dfe5f9b9cd..1242d61e71 100644 --- a/hw/dataplane/virtio-blk.c +++ b/hw/dataplane/virtio-blk.c @@ -263,6 +263,11 @@ static int process_request(IOQueue *ioq, struct iovec iov[], } } +static int flush_true(EventNotifier *e) +{ + return true; +} + static void handle_notify(EventNotifier *e) { VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, @@ -342,6 +347,14 @@ static void handle_notify(EventNotifier *e) } } +static int flush_io(EventNotifier *e) +{ + VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, + io_notifier); + + return s->num_reqs > 0; +} + static void handle_io(EventNotifier *e) { VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane, @@ -472,7 +485,7 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) exit(1); } s->host_notifier = *virtio_queue_get_host_notifier(vq); - aio_set_event_notifier(s->ctx, &s->host_notifier, handle_notify, NULL); + aio_set_event_notifier(s->ctx, &s->host_notifier, handle_notify, flush_true); /* Set up ioqueue */ ioq_init(&s->ioqueue, s->fd, REQ_MAX); @@ -480,7 +493,7 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb); } s->io_notifier = *ioq_get_notifier(&s->ioqueue); - aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io, NULL); + aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io, flush_io); s->started = true; trace_virtio_blk_data_plane_start(s); From acbf30ec601b1f817febc4500025b7c4181312c4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Mar 2013 15:58:13 +0100 Subject: [PATCH 26/28] qemu-iotests: add tests for rebasing zero clusters If zero clusters are erroneously treated as unallocated, "qemu-img rebase" will copy the backing file's contents onto the cluster. The bug existed also in image streaming, but since the root cause was in qcow2's is_allocated implementation it is enough to test it with qemu-img. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/050 | 75 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/050.out | 17 +++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 93 insertions(+) create mode 100755 tests/qemu-iotests/050 create mode 100644 tests/qemu-iotests/050.out diff --git a/tests/qemu-iotests/050 b/tests/qemu-iotests/050 new file mode 100755 index 0000000000..05793e2d4b --- /dev/null +++ b/tests/qemu-iotests/050 @@ -0,0 +1,75 @@ +#!/bin/bash +# +# Test qemu-img rebase with zero clusters +# +# Copyright (C) 2013 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=pbonzini@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img + rm -f $TEST_IMG.old + rm -f $TEST_IMG.new +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 qed +_supported_proto file +_supported_os Linux + +if test "$IMGFMT" = qcow2 && test $IMGOPTS = ""; then + IMGOPTS=compat=1.1 +fi + +echo +echo "== Creating images ==" + +size=10M +_make_test_img $size +$QEMU_IO -c "write -P 0x40 0 1048576" $TEST_IMG | _filter_qemu_io +mv $TEST_IMG $TEST_IMG.old + +_make_test_img $size +$QEMU_IO -c "write -P 0x5a 0 1048576" $TEST_IMG | _filter_qemu_io +mv $TEST_IMG $TEST_IMG.new + +_make_test_img -b $TEST_IMG.old $size +$QEMU_IO -c "write -z 0 1048576" $TEST_IMG | _filter_qemu_io + +echo +echo "== Rebasing the image ==" + +$QEMU_IMG rebase -b $TEST_IMG.new $TEST_IMG +$QEMU_IO -c "read -P 0x00 0 1048576" $TEST_IMG | _filter_qemu_io + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/050.out b/tests/qemu-iotests/050.out new file mode 100644 index 0000000000..3f5f7e1e7b --- /dev/null +++ b/tests/qemu-iotests/050.out @@ -0,0 +1,17 @@ +QA output created by 050 + +== Creating images == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=10485760 +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=10485760 +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=10485760 backing_file='TEST_DIR/t.IMGFMT.old' +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +== Rebasing the image == +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index fcf57e0510..1d7e4f31a0 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -56,3 +56,4 @@ 047 rw auto 048 img auto quick 049 rw auto +050 rw auto backing quick From 804dd41792c8bdc0c8b34c8e580882ff8ef736d8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Mar 2013 17:06:55 +0100 Subject: [PATCH 27/28] qemu-iotests: use -nographic in test case 007 A comment explains that -nographic hangs test case 007. This is no longer the case so add -nographic. This makes the test suite faster and more pleasant to run since no windows pop up. I am not sure exactly when -nographic starting working for this case but there is no fundamental reason why graphics are needed here. Make sure the serial port is not on stdio, it would conflict with the monitor. Also remove unnecessary trailing whitespace on these lines. Signed-off-by: Stefan Hajnoczi Tested-by: Kevin Wolf --- tests/qemu-iotests/007 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/007 b/tests/qemu-iotests/007 index 0139264c4f..c454f2c8ec 100755 --- a/tests/qemu-iotests/007 +++ b/tests/qemu-iotests/007 @@ -50,10 +50,9 @@ _make_test_img 1M for i in `seq 1 10`; do echo "savevm $i" - # XXX(hch): adding -nographic would be good, but hangs the test - $QEMU -hda $TEST_IMG -monitor stdio >/dev/null 2>&1 </dev/null 2>&1 < Date: Thu, 14 Mar 2013 13:59:53 +0100 Subject: [PATCH 28/28] blockdev: Fix up copyright and permission notice Screwed up in commit 666daa68. Thanks to Kevin Wolf for reminding me to fix this. Signed-off-by: Markus Armbruster Signed-off-by: Stefan Hajnoczi --- blockdev.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/blockdev.c b/blockdev.c index 7ae619860b..09f76b782f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -5,6 +5,29 @@ * * This work is licensed under the terms of the GNU GPL, version 2 or * later. See the COPYING file in the top-level directory. + * + * This file incorporates work covered by the following copyright and + * permission notice: + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ #include "sysemu/blockdev.h"