From 396374caeadf044bad0aae9447eeeb109ea8651c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Feb 2016 23:53:54 +0100 Subject: [PATCH 01/40] qemu-img: eliminate memory leak Not particularly important since qemu-img exits immediately after calling img_rebase, but easily fixed. Coverity says thanks. Signed-off-by: Paolo Bonzini Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- qemu-img.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 2edb139073..3103150717 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2775,6 +2775,8 @@ static int img_snapshot(int argc, char **argv) static int img_rebase(int argc, char **argv) { BlockBackend *blk = NULL, *blk_old_backing = NULL, *blk_new_backing = NULL; + uint8_t *buf_old = NULL; + uint8_t *buf_new = NULL; BlockDriverState *bs = NULL; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; @@ -2957,8 +2959,6 @@ static int img_rebase(int argc, char **argv) int64_t new_backing_num_sectors = 0; uint64_t sector; int n; - uint8_t * buf_old; - uint8_t * buf_new; float local_progress = 0; buf_old = blk_blockalign(blk, IO_BUF_SIZE); @@ -3070,9 +3070,6 @@ static int img_rebase(int argc, char **argv) } qemu_progress_print(local_progress, 100); } - - qemu_vfree(buf_old); - qemu_vfree(buf_new); } /* @@ -3108,6 +3105,8 @@ out: blk_unref(blk_old_backing); blk_unref(blk_new_backing); } + qemu_vfree(buf_old); + qemu_vfree(buf_new); blk_unref(blk); if (ret) { From 2b77e60ab86e240a06f1ef117ecd488e85d6226e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 26 Feb 2016 19:02:30 +0100 Subject: [PATCH 02/40] block/qapi: Factor out bdrv_query_blk_stats() The new functions handles the data that is taken from the BlockBackend. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/qapi.c | 131 ++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/block/qapi.c b/block/qapi.c index db2d3fb915..c04f1d8042 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -355,6 +355,73 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, qapi_free_BlockInfo(info); } +static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) +{ + BlockAcctStats *stats = blk_get_stats(blk); + BlockAcctTimedStats *ts = NULL; + + s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; + s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; + s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; + s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; + + s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; + s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; + + s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; + s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + s->stats->invalid_flush_operations = + stats->invalid_ops[BLOCK_ACCT_FLUSH]; + + s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; + s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; + s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; + s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; + s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; + s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; + + s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; + if (s->stats->has_idle_time_ns) { + s->stats->idle_time_ns = block_acct_idle_time_ns(stats); + } + + s->stats->account_invalid = stats->account_invalid; + s->stats->account_failed = stats->account_failed; + + while ((ts = block_acct_interval_next(stats, ts))) { + BlockDeviceTimedStatsList *timed_stats = + g_malloc0(sizeof(*timed_stats)); + BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); + timed_stats->next = s->stats->timed_stats; + timed_stats->value = dev_stats; + s->stats->timed_stats = timed_stats; + + TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; + TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; + + dev_stats->interval_length = ts->interval_length; + + dev_stats->min_rd_latency_ns = timed_average_min(rd); + dev_stats->max_rd_latency_ns = timed_average_max(rd); + dev_stats->avg_rd_latency_ns = timed_average_avg(rd); + + dev_stats->min_wr_latency_ns = timed_average_min(wr); + dev_stats->max_wr_latency_ns = timed_average_max(wr); + dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + + dev_stats->min_flush_latency_ns = timed_average_min(fl); + dev_stats->max_flush_latency_ns = timed_average_max(fl); + dev_stats->avg_flush_latency_ns = timed_average_avg(fl); + + dev_stats->avg_rd_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_READ); + dev_stats->avg_wr_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + } +} + static BlockStats *bdrv_query_stats(const BlockDriverState *bs, bool query_backing) { @@ -374,69 +441,7 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->stats = g_malloc0(sizeof(*s->stats)); if (bs->blk) { - BlockAcctStats *stats = blk_get_stats(bs->blk); - BlockAcctTimedStats *ts = NULL; - - s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; - s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; - s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; - s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; - - s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; - s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; - s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; - - s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; - s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; - s->stats->invalid_flush_operations = - stats->invalid_ops[BLOCK_ACCT_FLUSH]; - - s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; - s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; - s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; - s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; - s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; - s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; - - s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; - if (s->stats->has_idle_time_ns) { - s->stats->idle_time_ns = block_acct_idle_time_ns(stats); - } - - s->stats->account_invalid = stats->account_invalid; - s->stats->account_failed = stats->account_failed; - - while ((ts = block_acct_interval_next(stats, ts))) { - BlockDeviceTimedStatsList *timed_stats = - g_malloc0(sizeof(*timed_stats)); - BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); - timed_stats->next = s->stats->timed_stats; - timed_stats->value = dev_stats; - s->stats->timed_stats = timed_stats; - - TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; - TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; - TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; - - dev_stats->interval_length = ts->interval_length; - - dev_stats->min_rd_latency_ns = timed_average_min(rd); - dev_stats->max_rd_latency_ns = timed_average_max(rd); - dev_stats->avg_rd_latency_ns = timed_average_avg(rd); - - dev_stats->min_wr_latency_ns = timed_average_min(wr); - dev_stats->max_wr_latency_ns = timed_average_max(wr); - dev_stats->avg_wr_latency_ns = timed_average_avg(wr); - - dev_stats->min_flush_latency_ns = timed_average_min(fl); - dev_stats->max_flush_latency_ns = timed_average_max(fl); - dev_stats->avg_flush_latency_ns = timed_average_avg(fl); - - dev_stats->avg_rd_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_READ); - dev_stats->avg_wr_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); - } + bdrv_query_blk_stats(s, bs->blk); } s->stats->wr_highest_offset = bs->wr_highest_offset; From b07363a1a38695e12fc888b490b6215358752e85 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 26 Feb 2016 21:03:01 +0100 Subject: [PATCH 03/40] block/qapi: Factor out bdrv_query_bds_stats() The new functions handles the data that is taken from the BlockDriverState. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/qapi.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/block/qapi.c b/block/qapi.c index c04f1d8042..31ae87941f 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -355,6 +355,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, qapi_free_BlockInfo(info); } +static BlockStats *bdrv_query_stats(const BlockDriverState *bs, + bool query_backing); + static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) { BlockAcctStats *stats = blk_get_stats(blk); @@ -422,13 +425,9 @@ static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) } } -static BlockStats *bdrv_query_stats(const BlockDriverState *bs, - bool query_backing) +static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, + bool query_backing) { - BlockStats *s; - - s = g_malloc0(sizeof(*s)); - if (bdrv_get_device_name(bs)[0]) { s->has_device = true; s->device = g_strdup(bdrv_get_device_name(bs)); @@ -439,11 +438,6 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->node_name = g_strdup(bdrv_get_node_name(bs)); } - s->stats = g_malloc0(sizeof(*s->stats)); - if (bs->blk) { - bdrv_query_blk_stats(s, bs->blk); - } - s->stats->wr_highest_offset = bs->wr_highest_offset; if (bs->file) { @@ -456,6 +450,21 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s->backing = bdrv_query_stats(bs->backing->bs, query_backing); } +} + +static BlockStats *bdrv_query_stats(const BlockDriverState *bs, + bool query_backing) +{ + BlockStats *s; + + s = g_malloc0(sizeof(*s)); + s->stats = g_malloc0(sizeof(*s->stats)); + + if (bs->blk) { + bdrv_query_blk_stats(s, bs->blk); + } + bdrv_query_bds_stats(s, bs, query_backing); + return s; } From c21cc6ca989ebbeaed3e601ae4e521afbff5df54 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 26 Feb 2016 21:16:38 +0100 Subject: [PATCH 04/40] block/qapi: Include empty drives in query-blockstats Since commit 5ec18f8c, query-blockstats didn't return the statistics of drives without media any more because such drives have only a BB now, but not a BDS any more. This patch fixes the regression so that query-blockstats iterates over BBs by default and empty drives are displayed again. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/qapi.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/block/qapi.c b/block/qapi.c index 31ae87941f..6a4869a8d9 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -355,7 +355,8 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, qapi_free_BlockInfo(info); } -static BlockStats *bdrv_query_stats(const BlockDriverState *bs, +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, bool query_backing); static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) @@ -363,6 +364,9 @@ static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) BlockAcctStats *stats = blk_get_stats(blk); BlockAcctTimedStats *ts = NULL; + s->has_device = true; + s->device = g_strdup(blk_name(blk)); + s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; @@ -428,11 +432,6 @@ static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, bool query_backing) { - if (bdrv_get_device_name(bs)[0]) { - s->has_device = true; - s->device = g_strdup(bdrv_get_device_name(bs)); - } - if (bdrv_get_node_name(bs)[0]) { s->has_node_name = true; s->node_name = g_strdup(bdrv_get_node_name(bs)); @@ -442,17 +441,18 @@ static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, if (bs->file) { s->has_parent = true; - s->parent = bdrv_query_stats(bs->file->bs, query_backing); + s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing); } if (query_backing && bs->backing) { s->has_backing = true; - s->backing = bdrv_query_stats(bs->backing->bs, query_backing); + s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing); } } -static BlockStats *bdrv_query_stats(const BlockDriverState *bs, +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, bool query_backing) { BlockStats *s; @@ -460,10 +460,12 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs, s = g_malloc0(sizeof(*s)); s->stats = g_malloc0(sizeof(*s->stats)); - if (bs->blk) { - bdrv_query_blk_stats(s, bs->blk); + if (blk) { + bdrv_query_blk_stats(s, blk); + } + if (bs) { + bdrv_query_bds_stats(s, bs, query_backing); } - bdrv_query_bds_stats(s, bs, query_backing); return s; } @@ -491,22 +493,38 @@ BlockInfoList *qmp_query_block(Error **errp) return head; } +static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs, + bool query_nodes) +{ + if (query_nodes) { + *bs = bdrv_next_node(*bs); + return !!*bs; + } + + *blk = blk_next(*blk); + *bs = *blk ? blk_bs(*blk) : NULL; + + return !!*blk; +} + BlockStatsList *qmp_query_blockstats(bool has_query_nodes, bool query_nodes, Error **errp) { BlockStatsList *head = NULL, **p_next = &head; + BlockBackend *blk = NULL; BlockDriverState *bs = NULL; /* Just to be safe if query_nodes is not always initialized */ query_nodes = has_query_nodes && query_nodes; - while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) { + while (next_query_bds(&blk, &bs, query_nodes)) { BlockStatsList *info = g_malloc0(sizeof(*info)); - AioContext *ctx = bdrv_get_aio_context(bs); + AioContext *ctx = blk ? blk_get_aio_context(blk) + : bdrv_get_aio_context(bs); aio_context_acquire(ctx); - info->value = bdrv_query_stats(bs, !query_nodes); + info->value = bdrv_query_stats(blk, bs, !query_nodes); aio_context_release(ctx); *p_next = info; From c540d53ac849f385521acb7552ade4f6d3f55ca6 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Thu, 25 Feb 2016 12:27:27 -0500 Subject: [PATCH 05/40] block/vpc: choose size calculation method based on creator_app field The VHD file format is used by both Virtual PC, and Hyper-V. However, how the virtual disk size is calculated varies between the two. Virtual PC uses the CHS drive parameters to determine the drive size. Hyper-V, on the other hand, uses the current_size field in the footer when determining image size. This is problematic for a few reasons: * VHD images from Hyper-V, using CHS calculations, will likely be trunctated. * If we just rely always on current_size, then QEMU may have data compatibility issues with Virtual PC (we may write too much data into a VHD file to be used by Virtual PC, for instance). * Existing VHD images created by QEMU have used the CHS calculations, except for images exceeding the 127GB limit. We want to remain compatible with our own generated images. Luckily, the VHD specification defines a 'Creator App' field, that is used to indicate what software created the VHD file. This patch does two things: 1. Uses the 'Creator App' field to help determine how to calculate size, and 2. Adds a VPC format option 'force_size_calc', so that the user can override the 'Creator App' auto-detection, in case there exist VHD images with unknown or contradictory 'Creator App' entries. N.B.: We currently use the maximum CHS value as an indication to use the current_size field. This patch does not change that, even with the 'force_size_calc' option. Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/vpc.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index f504536d1c..54a38a7aed 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -128,6 +128,8 @@ typedef struct BDRVVPCState { uint32_t block_size; uint32_t bitmap_size; + bool force_use_chs; + bool force_use_sz; #ifdef CACHE uint8_t *pageentry_u8; @@ -140,6 +142,22 @@ typedef struct BDRVVPCState { Error *migration_blocker; } BDRVVPCState; +#define VPC_OPT_SIZE_CALC "force_size_calc" +static QemuOptsList vpc_runtime_opts = { + .name = "vpc-runtime-opts", + .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head), + .desc = { + { + .name = VPC_OPT_SIZE_CALC, + .type = QEMU_OPT_STRING, + .help = "Force disk size calculation to use either CHS geometry, " + "or use the disk current_size specified in the VHD footer. " + "{chs, current_size}" + }, + { /* end of list */ } + } +}; + static uint32_t vpc_checksum(uint8_t* buf, size_t size) { uint32_t res = 0; @@ -159,6 +177,25 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } +static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts, + Error **errp) +{ + BDRVVPCState *s = bs->opaque; + const char *size_calc; + + size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC); + + if (!size_calc) { + /* no override, use autodetect only */ + } else if (!strcmp(size_calc, "current_size")) { + s->force_use_sz = true; + } else if (!strcmp(size_calc, "chs")) { + s->force_use_chs = true; + } else { + error_setg(errp, "Invalid size calculation mode: '%s'", size_calc); + } +} + static int vpc_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -166,6 +203,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int i; VHDFooter *footer; VHDDynDiskHeader *dyndisk_header; + QemuOpts *opts = NULL; + Error *local_err = NULL; + bool use_chs; uint8_t buf[HEADER_SIZE]; uint32_t checksum; uint64_t computed_size; @@ -173,6 +213,21 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int disk_type = VHD_DYNAMIC; int ret; + opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + vpc_parse_options(bs, opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); if (ret < 0) { goto fail; @@ -218,12 +273,34 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - /* Images that have exactly the maximum geometry are probably bigger and - * would be truncated if we adhered to the geometry for them. Rely on - * footer->current_size for them. */ - if (bs->total_sectors == VHD_MAX_GEOMETRY) { + /* Microsoft Virtual PC and Microsoft Hyper-V produce and read + * VHD image sizes differently. VPC will rely on CHS geometry, + * while Hyper-V and disk2vhd use the size specified in the footer. + * + * We use a couple of approaches to try and determine the correct method: + * look at the Creator App field, and look for images that have CHS + * geometry that is the maximum value. + * + * If the CHS geometry is the maximum CHS geometry, then we assume that + * the size is the footer->current_size to avoid truncation. Otherwise, + * we follow the table based on footer->creator_app: + * + * Known creator apps: + * 'vpc ' : CHS Virtual PC (uses disk geometry) + * 'qemu' : CHS QEMU (uses disk geometry) + * 'win ' : current_size Hyper-V + * 'd2v ' : current_size Disk2vhd + * + * The user can override the table values via drive options, however + * even with an override we will still use current_size for images + * that have CHS geometry of the maximum size. + */ + use_chs = (!!strncmp(footer->creator_app, "win ", 4) && + !!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs; + + if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) { bs->total_sectors = be64_to_cpu(footer->current_size) / - BDRV_SECTOR_SIZE; + BDRV_SECTOR_SIZE; } /* Allow a maximum disk size of approximately 2 TB */ From 798609bbe29df4edf4bbec46927fc12862b02378 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Thu, 25 Feb 2016 12:27:28 -0500 Subject: [PATCH 06/40] block/vpc: tests for auto-detecting VPC and Hyper-V VHD images This tests auto-detection, and overrides, of VHD image sizes created by Virtual PC, Hyper-V, and Disk2vhd. This adds three sample images: hyperv2012r2-dynamic.vhd.bz2 - dynamic VHD image created with Hyper-V virtualpc-dynamic.vhd.bz2 - dynamic VHD image created with Virtual PC d2v-zerofilled.vhd.bz2 - dynamic VHD image created with Disk2vhd Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- tests/qemu-iotests/146 | 114 ++++++++++++++++++ tests/qemu-iotests/146.out | 38 ++++++ tests/qemu-iotests/group | 1 + .../sample_images/d2v-zerofilled.vhd.bz2 | Bin 0 -> 1021 bytes .../hyperv2012r2-dynamic.vhd.bz2 | Bin 0 -> 214 bytes .../sample_images/virtualpc-dynamic.vhd.bz2 | Bin 0 -> 212 bytes 6 files changed, 153 insertions(+) create mode 100755 tests/qemu-iotests/146 create mode 100644 tests/qemu-iotests/146.out create mode 100644 tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 create mode 100644 tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 create mode 100644 tests/qemu-iotests/sample_images/virtualpc-dynamic.vhd.bz2 diff --git a/tests/qemu-iotests/146 b/tests/qemu-iotests/146 new file mode 100755 index 0000000000..4cbe1f4068 --- /dev/null +++ b/tests/qemu-iotests/146 @@ -0,0 +1,114 @@ +#!/bin/bash +# +# Test VHD image format creator detection and override +# +# Copyright (C) 2016 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=jcody@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_qemu + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.qemu + +_supported_fmt vpc +_supported_proto file +_supported_os Linux + + +qemu_comm_method="monitor" +silent= + +echo +echo === Testing VPC Autodetect === +echo +_use_sample_img virtualpc-dynamic.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing VPC with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing VPC with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +_cleanup_test_img + +echo +echo === Testing Hyper-V Autodetect === +echo +_use_sample_img hyperv2012r2-dynamic.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing Hyper-V with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing Hyper-V with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +_cleanup_test_img + +echo +echo === Testing d2v Autodetect === +echo +_use_sample_img d2v-zerofilled.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing d2v with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing d2v with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + + +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/146.out b/tests/qemu-iotests/146.out new file mode 100644 index 0000000000..478ad4e4ee --- /dev/null +++ b/tests/qemu-iotests/146.out @@ -0,0 +1,38 @@ +QA output created by 146 + +=== Testing VPC Autodetect === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing VPC with current_size force === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing VPC with chs force === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V Autodetect === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V with current_size force === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V with chs force === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing d2v Autodetect === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing d2v with current_size force === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing d2v with chs force === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 47fd40c546..1211149249 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -148,3 +148,4 @@ 143 auto quick 144 rw auto quick 145 auto quick +146 auto quick diff --git a/tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 b/tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..f12cb9203aa3271af1d610d5313a3aa3ffad1a6e GIT binary patch literal 1021 zcmVR0Kl057?_45N2K&kFhwSy{iam`qxDZv z00000Gynhq0000000000HlQMuspUUPey9PViI4zfG|8rhKmY&%qednmXblEH&>Ca_ zWYB5qc4VLNl7#$VQwjl0Xaxii5h#j6sEUm1_%YBbC@P|3{PF>V2$@`#TQ8H$W@$Lg zunSNdj{OKq5`>`%N>Y?123z>=l}RDjq{>_xT55WUad`6}nE`}o-m$~F`x_^ZphAfn zO|8An)TxuGQmIYdlwK7|@o=XTJk-QQNXhmVo7%?O?8@<|vm>5o(%$o0Y^psI;F6c2CaKv)BP zpa2GWE&u_5&=y+8ic%G?Z+DQFvhR)xYNaV(A1s>;FuEJ-BVQ(@hddCK)7>No9svWtM-S007|t0002M+$MmK002S&004>ZC;2vf zS?+VWYEqSP%#rBgy6^ew%^6lpWoug2R<+8rT;7C8(p!1yl4xYQ>#oR-rAC0IDNZ=! zDN0h5r6@uWgdq*Zz53)tL`CKd(?^+^nVIm2h=`2|NoAH&B(Yi-HR$=LqW7aHq> r>VAzO3jW3zM|uPx5)cG{1Xu(02t5UF)d5g$5901frwS4U1zi6?E&-p( literal 0 HcmV?d00001 diff --git a/tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 b/tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..bfeccf7b9f2596f119bb35d7e650a29f8e6f17ef GIT binary patch literal 214 zcmV;{04e`MT4*^jL0KkKS<7>@aR2~j|Ns3EPyxUK5D)+a0HCTX+~fcS06+jB01!Yx z0Eqx02&u3Dw$>7=8feLprXhl8!7=I+01-+c00006fB?`)si-s!00E((0ieW+9LW!< z$bjJjeWx)DFnA{B7q@4*a;xX z24}+|;{k09^toeVF+M+y8lDaaN{P?GVI@TYSp=OSB$8;1RzUy&6`?NT5C8<(0ssS^ Oh1`)&6eJCs&V_&uMoJF= literal 0 HcmV?d00001 From fb9245c2610932d33ce148b58714fcc7b3c6eb5f Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Thu, 25 Feb 2016 12:27:29 -0500 Subject: [PATCH 07/40] block/vpc: give option to force the current_size field in .bdrv_create When QEMU creates a VHD image, it goes by the original spec, calculating the current_size based on the nearest CHS geometry (with an exception for disks > 127GB). Apparently, Azure will only allow images that are sized to the nearest MB, and the current_size as calculated from CHS cannot guarantee that. Allow QEMU to create images similar to how Hyper-V creates images, by setting current_size to the specified virtual disk size. This introduces an option, force_size, to be passed to the vpc format during image creation, e.g.: qemu-img convert -f raw -o force_size -O vpc test.img test.vhd When using the "force_size" option, the creator app field used by QEMU will be "qem2" instead of "qemu", to indicate the difference. In light of this, we also add parsing of the "qem2" field during vpc_open. Bug reference: https://bugs.launchpad.net/qemu/+bug/1490611 Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/vpc.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 54a38a7aed..318e6d66fb 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -46,8 +46,14 @@ enum vhd_type { // Seconds since Jan 1, 2000 0:00:00 (UTC) #define VHD_TIMESTAMP_BASE 946684800 +#define VHD_CHS_MAX_C 65535LL +#define VHD_CHS_MAX_H 16 +#define VHD_CHS_MAX_S 255 + #define VHD_MAX_SECTORS (65535LL * 255 * 255) -#define VHD_MAX_GEOMETRY (65535LL * 16 * 255) +#define VHD_MAX_GEOMETRY (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S) + +#define VPC_OPT_FORCE_SIZE "force_size" // always big-endian typedef struct vhd_footer { @@ -288,6 +294,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, * Known creator apps: * 'vpc ' : CHS Virtual PC (uses disk geometry) * 'qemu' : CHS QEMU (uses disk geometry) + * 'qem2' : current_size QEMU (uses current_size) * 'win ' : current_size Hyper-V * 'd2v ' : current_size Disk2vhd * @@ -296,6 +303,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, * that have CHS geometry of the maximum size. */ use_chs = (!!strncmp(footer->creator_app, "win ", 4) && + !!strncmp(footer->creator_app, "qem2", 4) && !!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs; if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) { @@ -850,6 +858,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size; int disk_type; int ret = -EIO; + bool force_size; Error *local_err = NULL; BlockDriverState *bs = NULL; @@ -870,6 +879,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) disk_type = VHD_DYNAMIC; } + force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); @@ -887,13 +898,20 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) * sectors requested until we get enough (or fail). This ensures that * qemu-img convert doesn't truncate images, but rather rounds up. * - * If the image size can't be represented by a spec conform CHS geometry, + * If the image size can't be represented by a spec conformant CHS geometry, * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use * the image size from the VHD footer to calculate total_sectors. */ - total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + if (force_size) { + /* This will force the use of total_size for sector count, below */ + cyls = VHD_CHS_MAX_C; + heads = VHD_CHS_MAX_H; + secs_per_cyl = VHD_CHS_MAX_S; + } else { + total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + } } if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { @@ -912,8 +930,11 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) memset(buf, 0, 1024); memcpy(footer->creator, "conectix", 8); - /* TODO Check if "qemu" creator_app is ok for VPC */ - memcpy(footer->creator_app, "qemu", 4); + if (force_size) { + memcpy(footer->creator_app, "qem2", 4); + } else { + memcpy(footer->creator_app, "qemu", 4); + } memcpy(footer->creator_os, "Wi2k", 4); footer->features = cpu_to_be32(0x02); @@ -994,6 +1015,13 @@ static QemuOptsList vpc_create_opts = { "Type of virtual hard disk format. Supported formats are " "{dynamic (default) | fixed} " }, + { + .name = VPC_OPT_FORCE_SIZE, + .type = QEMU_OPT_BOOL, + .help = "Force disk size calculation to use the actual size " + "specified, rather than using the nearest CHS-based " + "calculation" + }, { /* end of list */ } } }; From 1001dd9f844429c01ebf433cc67df8cf196d06d5 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Thu, 25 Feb 2016 12:27:30 -0500 Subject: [PATCH 08/40] block/vpc: add tests for image creation force_size parameter Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- tests/qemu-iotests/146 | 51 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/146.out | 32 ++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/tests/qemu-iotests/146 b/tests/qemu-iotests/146 index 4cbe1f4068..043711be68 100755 --- a/tests/qemu-iotests/146 +++ b/tests/qemu-iotests/146 @@ -108,6 +108,57 @@ echo ${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' +_cleanup_test_img + +echo +echo === Testing Image create, default === +echo + +TEST_IMG="${TEST_DIR}/vpc-create-test.vpc" + +_make_test_img 4G + +echo +echo === Read created image, default opts ==== +echo + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=chs ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=current_size ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing Image create, force_size === +echo + +_make_test_img -o force_size 4G + +echo +echo === Read created image, default opts ==== +echo + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=chs ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=current_size ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/146.out b/tests/qemu-iotests/146.out index 478ad4e4ee..4f334d86bc 100644 --- a/tests/qemu-iotests/146.out +++ b/tests/qemu-iotests/146.out @@ -35,4 +35,36 @@ QA output created by 146 === Testing d2v with chs force === [ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing Image create, default === + +Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296 + +=== Read created image, default opts ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=chs ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=current_size ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Testing Image create, force_size === + +Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296 force_size=on + +=== Read created image, default opts ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=chs ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=current_size ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) *** done From 58346b82edffff0fb886358aa6d69604b215a770 Mon Sep 17 00:00:00 2001 From: Changlong Xie Date: Fri, 26 Feb 2016 09:39:00 +0800 Subject: [PATCH 09/40] docs: fix invalid node name in qmp event Cc: Dr. David Alan Gilbert Cc: Wen Congyang Signed-off-by: Wen Congyang Signed-off-by: Changlong Xie Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- docs/qmp-events.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/qmp-events.txt b/docs/qmp-events.txt index 4e3eb9e77a..d6b9aeacca 100644 --- a/docs/qmp-events.txt +++ b/docs/qmp-events.txt @@ -337,7 +337,7 @@ Data: Example: { "event": "QUORUM_REPORT_BAD", - "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 }, + "data": { "node-name": "node0", "sector-num": 345435, "sectors-count": 5 }, "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } Note: this event is rate-limited. From 0ae053b7e1b74a551b89a84c706e0cb9a163d92c Mon Sep 17 00:00:00 2001 From: Changlong Xie Date: Fri, 26 Feb 2016 09:39:01 +0800 Subject: [PATCH 10/40] qmp event: Refactor QUORUM_REPORT_BAD Introduce QuorumOpType, and make QUORUM_REPORT_BAD compatible with it. Cc: Dr. David Alan Gilbert Cc: Wen Congyang Signed-off-by: Wen Congyang Signed-off-by: Changlong Xie Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/quorum.c | 17 ++++++++++++----- docs/qmp-events.txt | 11 ++++++++++- qapi/block.json | 16 ++++++++++++++++ qapi/event.json | 4 +++- 4 files changed, 41 insertions(+), 7 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index 11cc60b713..8f83574f27 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -215,14 +215,16 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } -static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, + int nb_sectors, char *node_name, int ret) { const char *msg = NULL; if (ret < 0) { msg = strerror(-ret); } - qapi_event_send_quorum_report_bad(!!msg, msg, node_name, - acb->sector_num, acb->nb_sectors, &error_abort); + + qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, + sector_num, nb_sectors, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) @@ -282,6 +284,7 @@ static void quorum_aio_cb(void *opaque, int ret) QuorumChildRequest *sacb = opaque; QuorumAIOCB *acb = sacb->parent; BDRVQuorumState *s = acb->common.bs->opaque; + QuorumOpType type; bool rewrite = false; if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { @@ -300,12 +303,14 @@ static void quorum_aio_cb(void *opaque, int ret) return; } + type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; sacb->ret = ret; acb->count++; if (ret == 0) { acb->success_count++; } else { - quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + quorum_report_bad(type, acb->sector_num, acb->nb_sectors, + sacb->aiocb->bs->node_name, ret); } assert(acb->count <= s->num_children); assert(acb->success_count <= s->num_children); @@ -338,7 +343,9 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, continue; } QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(acb, s->children[item->index]->bs->node_name, 0); + quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, + acb->nb_sectors, + s->children[item->index]->bs->node_name, 0); } } } diff --git a/docs/qmp-events.txt b/docs/qmp-events.txt index d6b9aeacca..fa7574d671 100644 --- a/docs/qmp-events.txt +++ b/docs/qmp-events.txt @@ -325,6 +325,7 @@ Emitted to report a corruption of a Quorum file. Data: +- "type": Quorum operation type - "error": Error message (json-string, optional) Only present on failure. This field contains a human-readable error message. There are no semantics other than that the @@ -336,10 +337,18 @@ Data: Example: +Read operation: { "event": "QUORUM_REPORT_BAD", - "data": { "node-name": "node0", "sector-num": 345435, "sectors-count": 5 }, + "data": { "node-name": "node0", "sector-num": 345435, "sectors-count": 5, + "type": "read" }, "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } +Flush operation: +{ "event": "QUORUM_REPORT_BAD", + "data": { "node-name": "node0", "sector-num": 0, "sectors-count": 2097120, + "type": "flush", "error": "Broken pipe" }, + "timestamp": { "seconds": 1456406829, "microseconds": 291763 } } + Note: this event is rate-limited. RESET diff --git a/qapi/block.json b/qapi/block.json index 58e6b301bf..937337dce5 100644 --- a/qapi/block.json +++ b/qapi/block.json @@ -196,3 +196,19 @@ ## { 'event': 'DEVICE_TRAY_MOVED', 'data': { 'device': 'str', 'tray-open': 'bool' } } + +## +# @QuorumOpType +# +# An enumeration of the quorum operation types +# +# @read: read operation +# +# @write: write operation +# +# @flush: flush operation +# +# Since: 2.6 +## +{ 'enum': 'QuorumOpType', + 'data': [ 'read', 'write', 'flush' ] } diff --git a/qapi/event.json b/qapi/event.json index 1a45a6cb26..8642052ebc 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -325,6 +325,8 @@ # # Emitted to report a corruption of a Quorum file # +# @type: quorum operation type (Since 2.6) +# # @error: #optional, error message. Only present on failure. This field # contains a human-readable error message. There are no semantics other # than that the block layer reported an error and clients should not @@ -339,7 +341,7 @@ # Since: 2.0 ## { 'event': 'QUORUM_REPORT_BAD', - 'data': { '*error': 'str', 'node-name': 'str', + 'data': { 'type': 'QuorumOpType', '*error': 'str', 'node-name': 'str', 'sector-num': 'int', 'sectors-count': 'int' } } ## From 924e8a2bbc7cc62b3996efe9a2a460f541c04520 Mon Sep 17 00:00:00 2001 From: Changlong Xie Date: Fri, 26 Feb 2016 09:39:02 +0800 Subject: [PATCH 11/40] quorum: modify vote rules for flush operation Keep flush interface the same logic as quorum read/write, Otherwise in following scenario, we'll encounter unexpected errors. Quorum has two children(A, B). A do flush sucessfully, but B flush failed. This cause the filesystem of guest become read-only with following errors: end_request: I/O error, dev vda, sector 11159960 Aborting journal on device vda3-8 EXT4-fs error (device vda3): ext4_journal_start_sb:327: Detected abort journal EXT4-fs (vda3): Remounting filesystem read-only Cc: Dr. David Alan Gilbert Cc: Wen Congyang Signed-off-by: Wen Congyang Signed-off-by: Changlong Xie Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/quorum.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index 8f83574f27..b16171b57e 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -767,19 +767,30 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) QuorumVoteValue result_value; int i; int result = 0; + int success_count = 0; QLIST_INIT(&error_votes.vote_list); error_votes.compare = quorum_64bits_compare; for (i = 0; i < s->num_children; i++) { result = bdrv_co_flush(s->children[i]->bs); - result_value.l = result; - quorum_count_vote(&error_votes, &result_value, i); + if (result) { + quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, + bdrv_nb_sectors(s->children[i]->bs), + s->children[i]->bs->node_name, result); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } else { + success_count++; + } } - winner = quorum_get_vote_winner(&error_votes); - result = winner->value.l; - + if (success_count >= s->threshold) { + result = 0; + } else { + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + } quorum_free_vote_list(&error_votes); return result; From f86b8b584b114d68036bf576057f51caec7b94ba Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 2 Mar 2016 12:16:44 +0100 Subject: [PATCH 12/40] blockdev: Snapshotting must not open second instance of old top Calling bdrv_img_create() with a size of -1 means that it determines the size automatically by opening the backing file. However, in the case of live snapshots, the backing file is already opened and we must avoid opening the same image twice at the same time. Apart from that, just getting the size from the already existing BDS is a lot less overhead than opening a new instance. Signed-off-by: Kevin Wolf Reviewed-by: Jeff Cody --- blockdev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index 0f20c6511f..e1c1540010 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1732,10 +1732,15 @@ static void external_snapshot_prepare(BlkActionState *common, /* create new image w/backing file */ mode = s->has_mode ? s->mode : NEW_IMAGE_MODE_ABSOLUTE_PATHS; if (mode != NEW_IMAGE_MODE_EXISTING) { + int64_t size = bdrv_getlength(state->old_bs); + if (size < 0) { + error_setg_errno(errp, -size, "bdrv_getlength failed"); + return; + } bdrv_img_create(new_image_file, format, state->old_bs->filename, state->old_bs->drv->format_name, - NULL, -1, flags, &local_err, false); + NULL, size, flags, &local_err, false); if (local_err) { error_propagate(errp, local_err); return; From 73176bee9918a3738237b0e06eff72b497283869 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 7 Mar 2016 13:02:15 +0100 Subject: [PATCH 13/40] block: Fix snapshot=on cache modes Since commit 91a097e, we end up with a somewhat weird cache mode configuration with snapshot=on: The commit broke the cache mode inheritance for the snapshot overlay so that it is opened as writethrough instead of unsafe now. The following bdrv_append() call to put it on top of the tree swaps the WCE flag with the snapshot's backing file (i.e. the originally given file), so what we eventually get is cache=writeback on the temporary overlay and cache=writethrough,cache.no-flush=on on the real image file. This patch changes things so that the temporary overlay gets cache=unsafe again like it used to, and the real images get whatever the user specified. This means that cache.direct is now respected even with snapshot=on, and in the case of committing changes, the final flush is no longer ignored except explicitly requested by the user. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 36 +++++++++++++++++++++++++----------- blockdev.c | 7 ------- include/block/block.h | 1 - 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/block.c b/block.c index ba24b8e674..cf5eb34382 100644 --- a/block.c +++ b/block.c @@ -687,13 +687,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) } /* - * Returns the flags that a temporary snapshot should get, based on the - * originally requested flags (the originally requested image will have flags - * like a backing file) + * Returns the options and flags that a temporary snapshot should get, based on + * the originally requested flags (the originally requested image will have + * flags like a backing file) */ -static int bdrv_temp_snapshot_flags(int flags) +static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) { - return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; + *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; + + /* For temporary files, unconditional cache=unsafe is fine */ + qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on"); + qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off"); + qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on"); } /* @@ -1424,13 +1430,13 @@ done: return c; } -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) +static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, + QDict *snapshot_options, Error **errp) { /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ char *tmp_filename = g_malloc0(PATH_MAX + 1); int64_t total_size; QemuOpts *opts = NULL; - QDict *snapshot_options; BlockDriverState *bs_snapshot; Error *local_err = NULL; int ret; @@ -1464,8 +1470,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) goto out; } - /* Prepare a new options QDict for the temporary file */ - snapshot_options = qdict_new(); + /* Prepare options QDict for the temporary file */ qdict_put(snapshot_options, "file.driver", qstring_from_str("file")); qdict_put(snapshot_options, "file.filename", @@ -1477,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options, flags, &local_err); + snapshot_options = NULL; if (ret < 0) { error_propagate(errp, local_err); goto out; @@ -1485,6 +1491,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) bdrv_append(bs_snapshot, bs); out: + QDECREF(snapshot_options); g_free(tmp_filename); return ret; } @@ -1516,6 +1523,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, const char *drvname; const char *backing; Error *local_err = NULL; + QDict *snapshot_options = NULL; int snapshot_flags = 0; assert(pbs); @@ -1607,7 +1615,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, flags |= BDRV_O_ALLOW_RDWR; } if (flags & BDRV_O_SNAPSHOT) { - snapshot_flags = bdrv_temp_snapshot_flags(flags); + snapshot_options = qdict_new(); + bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options, + flags, options); bdrv_backing_options(&flags, options, flags, options); } @@ -1709,7 +1719,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, /* For snapshot=on, create a temporary qcow2 overlay. bs points to the * temporary snapshot afterwards. */ if (snapshot_flags) { - ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err); + ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options, + &local_err); + snapshot_options = NULL; if (local_err) { goto close_and_fail; } @@ -1721,6 +1733,7 @@ fail: if (file != NULL) { bdrv_unref_child(bs, file); } + QDECREF(snapshot_options); QDECREF(bs->explicit_options); QDECREF(bs->options); QDECREF(options); @@ -1743,6 +1756,7 @@ close_and_fail: } else { bdrv_unref(bs); } + QDECREF(snapshot_options); QDECREF(options); if (local_err) { error_propagate(errp, local_err); diff --git a/blockdev.c b/blockdev.c index e1c1540010..eecd78d276 100644 --- a/blockdev.c +++ b/blockdev.c @@ -593,13 +593,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); - if (snapshot) { - /* always use cache=unsafe with snapshot */ - qdict_put(bs_opts, BDRV_OPT_CACHE_WB, qstring_from_str("on")); - qdict_put(bs_opts, BDRV_OPT_CACHE_DIRECT, qstring_from_str("off")); - qdict_put(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, qstring_from_str("on")); - } - if (runstate_check(RUN_STATE_INMIGRATE)) { bdrv_flags |= BDRV_O_INACTIVE; } diff --git a/include/block/block.h b/include/block/block.h index 1c4f4d8141..3900c4d8c3 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -215,7 +215,6 @@ BdrvChild *bdrv_open_child(const char *filename, void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd); int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, const char *bdref_key, Error **errp); -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp); int bdrv_open(BlockDriverState **pbs, const char *filename, const char *reference, QDict *options, int flags, Error **errp); BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, From a81d616437e53106c31ba5025d5f359e592a2ef4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 7 Mar 2016 14:23:04 +0100 Subject: [PATCH 14/40] block: Fix cache mode defaults in bds_tree_init() Without setting explicit defaults in the options, blockdev-add without an ID ended up defaulting to writethrough. It should be writeback as documented. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- blockdev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/blockdev.c b/blockdev.c index eecd78d276..1824cae259 100644 --- a/blockdev.c +++ b/blockdev.c @@ -675,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp) goto fail; } + /* bdrv_open() defaults to the values in bdrv_flags (for compatibility + * with other callers) rather than what we want as the real defaults. + * Apply the defaults here instead. */ + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on"); + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); + if (runstate_check(RUN_STATE_INMIGRATE)) { bdrv_flags |= BDRV_O_INACTIVE; } From 965415eb20b8f2252289166da1bc6e488b466873 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 16:24:34 +0800 Subject: [PATCH 15/40] vmdk: Switch to heap arrays for vmdk_write_cid It is only called once for each opened image, so we can do it the easy way. Reviewed-by: Peter Xu Signed-off-by: Fam Zheng Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/vmdk.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a8db5d9ec2..1ec2452da5 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -274,36 +274,39 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) { - char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *desc, *tmp_desc; char *p_name, *tmp_str; BDRVVmdkState *s = bs->opaque; - int ret; + int ret = 0; + desc = g_malloc0(DESC_SIZE); + tmp_desc = g_malloc0(DESC_SIZE); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } desc[DESC_SIZE - 1] = '\0'; tmp_str = strstr(desc, "parentCID"); if (tmp_str == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } - pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + pstrcpy(tmp_desc, DESC_SIZE, tmp_str); p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); - snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); - pstrcat(desc, sizeof(desc), tmp_desc); + snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); + pstrcat(desc, DESC_SIZE, tmp_desc); } ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - return ret; - } - return 0; +out: + g_free(desc); + g_free(tmp_desc); + return ret; } static int vmdk_is_cid_valid(BlockDriverState *bs) From 5997c210b9ed7d6dbc64aa5049eb2bc4ec574aba Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 16:24:35 +0800 Subject: [PATCH 16/40] vmdk: Switch to heap arrays for vmdk_read_cid Signed-off-by: Fam Zheng Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/vmdk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/vmdk.c b/block/vmdk.c index 1ec2452da5..c68f4563ad 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,15 +242,17 @@ static void vmdk_free_last_extent(BlockDriverState *bs) static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { - char desc[DESC_SIZE]; + char *desc; uint32_t cid = 0xffffffff; const char *p_name, *cid_str; size_t cid_str_size; BDRVVmdkState *s = bs->opaque; int ret; + desc = g_malloc0(DESC_SIZE); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { + g_free(desc); return 0; } @@ -269,6 +271,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) sscanf(p_name, "%" SCNx32, &cid); } + g_free(desc); return cid; } From 71968dbfd8d190321ae54caf2ddfdf012ee97b3c Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 16:24:36 +0800 Subject: [PATCH 17/40] vmdk: Switch to heap arrays for vmdk_parent_open Signed-off-by: Fam Zheng Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/vmdk.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index c68f4563ad..03be7f0f5f 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -343,15 +343,16 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, static int vmdk_parent_open(BlockDriverState *bs) { char *p_name; - char desc[DESC_SIZE + 1]; + char *desc; BDRVVmdkState *s = bs->opaque; int ret; - desc[DESC_SIZE] = '\0'; + desc = g_malloc0(DESC_SIZE + 1); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } + ret = 0; p_name = strstr(desc, "parentFileNameHint"); if (p_name != NULL) { @@ -360,16 +361,20 @@ static int vmdk_parent_open(BlockDriverState *bs) p_name += sizeof("parentFileNameHint") + 1; end_name = strchr(p_name, '\"'); if (end_name == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { - return -EINVAL; + ret = -EINVAL; + goto out; } pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); } - return 0; +out: + g_free(desc); + return ret; } /* Create and append extent to the extent array. Return the added VmdkExtent From abb21ac3e602430bd7a35e88ecc2eb62c91c7b42 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 23 Feb 2016 17:33:24 +0100 Subject: [PATCH 18/40] hmp: 'drive_add -n' for creating a node without BB This patch adds an option to the drive_add HMP command to create only a BlockDriverState without a BlockBackend on top. The motivation for this is that libvirt needs to specify options to a migration target (specifically, detect-zeroes). drive-mirror doesn't allow specifying options, and the proper way to do this is to create the target BDS separately with blockdev-add (where you can specify options) and then use blockdev-mirror to that BDS. However, libvirt can't use blockdev-add as long as it is still experimental, and we're expecting that it will still take some time, so we need to resort to drive_add. The problem with drive_add is that so far it always created a BB, and BDSes with a BB can't be used as a mirroring target as long as we don't support multiple BBs per BDS - and while we're working towards that goal, it's another thing that will still take some time. So to achieve the goal, the simplest solution to provide the functionality now without adding one-off options to the mirror QMP commands is to extend drive_add to create nodes without BBs. Signed-off-by: Kevin Wolf --- blockdev.c | 30 ++++++++++++++++++++++++++++++ device-hotplug.c | 7 +++++++ hmp-commands.hx | 4 ++-- include/block/block_int.h | 2 ++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 1824cae259..1297c90512 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3875,6 +3875,36 @@ out: aio_context_release(aio_context); } +void hmp_drive_add_node(Monitor *mon, const char *optstr) +{ + QemuOpts *opts; + QDict *qdict; + Error *local_err = NULL; + + opts = qemu_opts_parse_noisily(&qemu_drive_opts, optstr, false); + if (!opts) { + return; + } + + qdict = qemu_opts_to_qdict(opts, NULL); + + if (!qdict_get_try_str(qdict, "node-name")) { + error_report("'node-name' needs to be specified"); + goto out; + } + + BlockDriverState *bs = bds_tree_init(qdict, &local_err); + if (!bs) { + error_report_err(local_err); + goto out; + } + + QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list); + +out: + qemu_opts_del(opts); +} + void qmp_blockdev_add(BlockdevOptions *options, Error **errp) { QmpOutputVisitor *ov = qmp_output_visitor_new(); diff --git a/device-hotplug.c b/device-hotplug.c index 9a7cd669d5..3e5cdaad10 100644 --- a/device-hotplug.c +++ b/device-hotplug.c @@ -30,6 +30,7 @@ #include "qemu/config-file.h" #include "sysemu/sysemu.h" #include "monitor/monitor.h" +#include "block/block_int.h" static DriveInfo *add_init_drive(const char *optstr) { @@ -55,6 +56,12 @@ void hmp_drive_add(Monitor *mon, const QDict *qdict) { DriveInfo *dinfo = NULL; const char *opts = qdict_get_str(qdict, "opts"); + bool node = qdict_get_try_bool(qdict, "node", false); + + if (node) { + hmp_drive_add_node(mon, opts); + return; + } dinfo = add_init_drive(opts); if (!dinfo) { diff --git a/hmp-commands.hx b/hmp-commands.hx index 639205b692..4f4f60a0df 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1201,8 +1201,8 @@ ETEXI { .name = "drive_add", - .args_type = "pci_addr:s,opts:s", - .params = "[[:]:]\n" + .args_type = "node:-n,pci_addr:s,opts:s", + .params = "[-n] [[:]:]\n" "[file=file][,if=type][,bus=n]\n" "[,unit=m][,media=d][,index=i]\n" "[,cyls=c,heads=h,secs=s[,trans=t]]\n" diff --git a/include/block/block_int.h b/include/block/block_int.h index 9ef823a660..dda5ba0927 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -694,6 +694,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockCompletionFunc *cb, void *opaque, BlockJobTxn *txn, Error **errp); +void hmp_drive_add_node(Monitor *mon, const char *optstr); + void blk_set_bs(BlockBackend *blk, BlockDriverState *bs); void blk_dev_change_media_cb(BlockBackend *blk, bool load); From 2073d410ce3f2b1507bccd6aba68e8808647f9a9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 23 Feb 2016 17:50:37 +0100 Subject: [PATCH 19/40] hmp: Extend drive_del to delete nodes without BB Now that we can use drive_add to create new nodes without a BB, we also want to be able to delete such nodes again. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- blockdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/blockdev.c b/blockdev.c index 1297c90512..322ca03908 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2824,6 +2824,15 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict) AioContext *aio_context; Error *local_err = NULL; + bs = bdrv_find_node(id); + if (bs) { + qmp_x_blockdev_del(false, NULL, true, id, &local_err); + if (local_err) { + error_report_err(local_err); + } + return; + } + blk = blk_by_name(id); if (!blk) { error_report("Device '%s' not found", id); From 6340472c54529c5b703deec3ab0d6bdfe644f11e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 4 Mar 2016 14:53:50 +0100 Subject: [PATCH 20/40] block: Use writeback in .bdrv_create() implementations There's no reason to use a writethrough cache mode while creating an image. Signed-off-by: Kevin Wolf --- block/parallels.c | 3 ++- block/qcow.c | 3 ++- block/qcow2.c | 3 ++- block/sheepdog.c | 6 ++++-- block/vdi.c | 3 ++- block/vhdx.c | 3 ++- block/vmdk.c | 9 ++++++--- block/vpc.c | 3 ++- 8 files changed, 22 insertions(+), 11 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index 645521d783..3f9fd48cc6 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -479,7 +479,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) file = NULL; ret = bdrv_open(&file, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); if (ret < 0) { error_propagate(errp, local_err); return ret; diff --git a/block/qcow.c b/block/qcow.c index 251910cc9d..c46810c56b 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -798,7 +798,8 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) qcow_bs = NULL; ret = bdrv_open(&qcow_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); if (ret < 0) { error_propagate(errp, local_err); goto cleanup; diff --git a/block/qcow2.c b/block/qcow2.c index 8babecdab2..5a79177546 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2173,7 +2173,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, } bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, &local_err); if (ret < 0) { error_propagate(errp, local_err); diff --git a/block/sheepdog.c b/block/sheepdog.c index 05677ed983..2488a8e67d 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1645,7 +1645,8 @@ static int sd_prealloc(const char *filename, Error **errp) void *buf = NULL; int ret; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, errp); if (ret < 0) { goto out_with_err_set; @@ -1838,7 +1839,8 @@ static int sd_create(const char *filename, QemuOpts *opts, } bs = NULL; - ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp); + ret = bdrv_open(&bs, backing_file, NULL, NULL, + BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp); if (ret < 0) { goto out; } diff --git a/block/vdi.c b/block/vdi.c index b403243604..12407c43e9 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -766,7 +766,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto exit; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, &local_err); if (ret < 0) { error_propagate(errp, local_err); diff --git a/block/vhdx.c b/block/vhdx.c index 9a51428317..ea030ad509 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1838,7 +1838,8 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, &local_err); if (ret < 0) { error_propagate(errp, local_err); diff --git a/block/vmdk.c b/block/vmdk.c index 03be7f0f5f..dd8093688a 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1664,7 +1664,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, } assert(bs == NULL); - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, &local_err); if (ret < 0) { error_propagate(errp, local_err); @@ -1944,7 +1945,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) ret = -ENOENT; goto exit; } - ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, errp); + ret = bdrv_open(&bs, full_backing, NULL, NULL, + BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp); g_free(full_backing); if (ret != 0) { goto exit; @@ -2015,7 +2017,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) } assert(new_bs == NULL); ret = bdrv_open(&new_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); if (ret < 0) { error_propagate(errp, local_err); goto exit; diff --git a/block/vpc.c b/block/vpc.c index 318e6d66fb..1db47d63ba 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -886,7 +886,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto out; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, &local_err); if (ret < 0) { error_propagate(errp, local_err); From c10c9d96158ce4d05f4325e64c0ce6a5fcd64b8b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 16:39:49 +0100 Subject: [PATCH 21/40] block: Introduce blk_set_allow_write_beyond_eof() We check that the guest can't write beyond the end of its disk, but for other internal users it can make sense to allow growing a file. Signed-off-by: Kevin Wolf --- block/block-backend.c | 23 ++++++++++++++++------- include/sysemu/block-backend.h | 1 + 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index ebdf78a11c..03e71b4368 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -50,6 +50,8 @@ struct BlockBackend { bool iostatus_enabled; BlockDeviceIoStatus iostatus; + bool allow_write_beyond_eof; + NotifierList remove_bs_notifiers, insert_bs_notifiers; }; @@ -579,6 +581,11 @@ void blk_iostatus_set_err(BlockBackend *blk, int error) } } +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow) +{ + blk->allow_write_beyond_eof = allow; +} + static int blk_check_byte_request(BlockBackend *blk, int64_t offset, size_t size) { @@ -592,17 +599,19 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, return -ENOMEDIUM; } - len = blk_getlength(blk); - if (len < 0) { - return len; - } - if (offset < 0) { return -EIO; } - if (offset > len || len - offset < size) { - return -EIO; + if (!blk->allow_write_beyond_eof) { + len = blk_getlength(blk); + if (len < 0) { + return len; + } + + if (offset > len || len - offset < size) { + return -EIO; + } } return 0; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 66c5cf22e1..00d69baa07 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -78,6 +78,7 @@ void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs); void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk); +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); void blk_iostatus_enable(BlockBackend *blk); bool blk_iostatus_is_enabled(const BlockBackend *blk); BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk); From 8942764f548e239b1b78c28bb662bef3e1221c3d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 22/40] parallels: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/parallels.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index 3f9fd48cc6..0d1a60c972 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -30,6 +30,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/bitmap.h" #include "qapi/util.h" @@ -461,7 +462,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size, cl_size; uint8_t tmp[BDRV_SECTOR_SIZE]; Error *local_err = NULL; - BlockDriverState *file; + BlockBackend *file; uint32_t bat_entries, bat_sectors; ParallelsHeader header; int ret; @@ -477,15 +478,17 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) return ret; } - file = NULL; - ret = bdrv_open(&file, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + file = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (file == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } - ret = bdrv_truncate(file, 0); + + blk_set_allow_write_beyond_eof(file, true); + + ret = blk_truncate(file, 0); if (ret < 0) { goto exit; } @@ -509,18 +512,18 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); if (ret < 0) { goto exit; } - ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); if (ret < 0) { goto exit; } ret = 0; done: - bdrv_unref(file); + blk_unref(file); return ret; exit: From 6af40160209f579997e4adb05648813a717caab6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 23/40] qcow: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/qcow.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index c46810c56b..2fd5ee65d4 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include #include "qapi/qmp/qerror.h" @@ -780,7 +781,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) int flags = 0; Error *local_err = NULL; int ret; - BlockDriverState *qcow_bs; + BlockBackend *qcow_blk; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -796,16 +797,18 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) goto cleanup; } - qcow_bs = NULL; - ret = bdrv_open(&qcow_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + qcow_blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (qcow_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto cleanup; } - ret = bdrv_truncate(qcow_bs, 0); + blk_set_allow_write_beyond_eof(qcow_blk, true); + + ret = blk_truncate(qcow_blk, 0); if (ret < 0) { goto exit; } @@ -845,13 +848,13 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); if (ret != sizeof(header)) { goto exit; } if (backing_file) { - ret = bdrv_pwrite(qcow_bs, sizeof(header), + ret = blk_pwrite(qcow_blk, sizeof(header), backing_file, backing_filename_len); if (ret != backing_filename_len) { goto exit; @@ -861,7 +864,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) tmp = g_malloc0(BDRV_SECTOR_SIZE); for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ BDRV_SECTOR_SIZE); i++) { - ret = bdrv_pwrite(qcow_bs, header_size + + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); @@ -872,7 +875,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) g_free(tmp); ret = 0; exit: - bdrv_unref(qcow_bs); + blk_unref(qcow_blk); cleanup: g_free(backing_file); return ret; From 23588797b6830254ee25ed682bdd7f2cd65e1252 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 24/40] qcow2: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/qcow2.c | 62 +++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5a79177546..1ce6264011 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include #include "block/qcow2.h" @@ -2097,7 +2098,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file * size for any qcow2 image. */ - BlockDriverState* bs; + BlockBackend *blk; QCowHeader *header; uint64_t* refcount_table; Error *local_err = NULL; @@ -2172,15 +2173,16 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* Write the header */ QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); header = g_malloc0(cluster_size); @@ -2208,7 +2210,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -2218,7 +2220,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { @@ -2226,8 +2228,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, goto out; } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* * And now open the image and make it consistent first (i.e. increase the @@ -2236,15 +2238,16 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ options = qdict_new(); qdict_put(options, "driver", qstring_from_str("qcow2")); - ret = bdrv_open(&bs, filename, NULL, options, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, - &local_err); - if (ret < 0) { + blk = blk_new_open("image-qcow2", filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } - ret = qcow2_alloc_clusters(bs, 3 * cluster_size); + ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " "header and refcount table"); @@ -2256,14 +2259,14 @@ static int qcow2_create2(const char *filename, int64_t total_size, } /* Create a full header (including things like feature table) */ - ret = qcow2_update_header(bs); + ret = qcow2_update_header(blk_bs(blk)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not update qcow2 header"); goto out; } /* Okay, now that we have a valid image, let's give it the right size */ - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not resize image"); goto out; @@ -2271,7 +2274,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Want a backing file? There you go.*/ if (backing_file) { - ret = bdrv_change_backing_file(bs, backing_file, backing_format); + ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format); if (ret < 0) { error_setg_errno(errp, -ret, "Could not assign backing file '%s' " "with format '%s'", backing_file, backing_format); @@ -2281,9 +2284,9 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* And if we're supposed to preallocate metadata, do that now */ if (prealloc != PREALLOC_MODE_OFF) { - BDRVQcow2State *s = bs->opaque; + BDRVQcow2State *s = blk_bs(blk)->opaque; qemu_co_mutex_lock(&s->lock); - ret = preallocate(bs); + ret = preallocate(blk_bs(blk)); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { error_setg_errno(errp, -ret, "Could not preallocate metadata"); @@ -2291,24 +2294,25 @@ static int qcow2_create2(const char *filename, int64_t total_size, } } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ options = qdict_new(); qdict_put(options, "driver", qstring_from_str("qcow2")); - ret = bdrv_open(&bs, filename, NULL, options, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, - &local_err); - if (local_err) { + blk = blk_new_open("image-flush", filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } ret = 0; out: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } return ret; } From 8a56fdadaff912b2f1ad581ac42580945e36a74b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 25/40] qed: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/qed.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/block/qed.c b/block/qed.c index 404be1e9b9..8de7dd0832 100644 --- a/block/qed.c +++ b/block/qed.c @@ -18,6 +18,7 @@ #include "qed.h" #include "qapi/qmp/qerror.h" #include "migration/migration.h" +#include "sysemu/block-backend.h" static const AIOCBInfo qed_aiocb_info = { .aiocb_size = sizeof(QEDAIOCB), @@ -580,7 +581,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, size_t l1_size = header.cluster_size * header.table_size; Error *local_err = NULL; int ret = 0; - BlockDriverState *bs; + BlockBackend *blk; ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { @@ -588,17 +589,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* File must start empty and grow, check truncate is supported */ - ret = bdrv_truncate(bs, 0); + ret = blk_truncate(blk, 0); if (ret < 0) { goto out; } @@ -614,18 +616,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, - header.backing_filename_size); + ret = blk_pwrite(blk, sizeof(le_header), backing_file, + header.backing_filename_size); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); if (ret < 0) { goto out; } @@ -633,7 +635,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_unref(bs); + blk_unref(blk); return ret; } From fba98d455a7de1f2ef801d60ae895b173f4738c5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 26/40] sheepdog: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/sheepdog.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 2488a8e67d..a6e98a5a72 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "qemu/sockets.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/bitops.h" #define SD_PROTO_VER 0x01 @@ -1636,7 +1637,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, static int sd_prealloc(const char *filename, Error **errp) { - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; BDRVSheepdogState *base = NULL; unsigned long buf_size; uint32_t idx, max_idx; @@ -1645,20 +1646,23 @@ static int sd_prealloc(const char *filename, Error **errp) void *buf = NULL; int ret; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - errp); - if (ret < 0) { + blk = blk_new_open("image-prealloc", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + errp); + if (blk == NULL) { + ret = -EIO; goto out_with_err_set; } - vdi_size = bdrv_getlength(bs); + blk_set_allow_write_beyond_eof(blk, true); + + vdi_size = blk_getlength(blk); if (vdi_size < 0) { ret = vdi_size; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; object_size = (UINT32_C(1) << base->inode.block_size_shift); buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); buf = g_malloc0(buf_size); @@ -1670,23 +1674,24 @@ static int sd_prealloc(const char *filename, Error **errp) * The created image can be a cloned image, so we need to read * a data from the source image. */ - ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); + ret = blk_pread(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } } + ret = 0; out: if (ret < 0) { error_setg_errno(errp, -ret, "Can't pre-allocate"); } out_with_err_set: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(buf); @@ -1826,7 +1831,7 @@ static int sd_create(const char *filename, QemuOpts *opts, } if (backing_file) { - BlockDriverState *bs; + BlockBackend *blk; BDRVSheepdogState *base; BlockDriver *drv; @@ -1838,23 +1843,23 @@ static int sd_create(const char *filename, QemuOpts *opts, goto out; } - bs = NULL; - ret = bdrv_open(&bs, backing_file, NULL, NULL, - BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp); - if (ret < 0) { + blk = blk_new_open("backing", backing_file, NULL, NULL, + BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp); + if (blk == NULL) { + ret = -EIO; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; if (!is_snapshot(&base->inode)) { error_setg(errp, "cannot clone from a non snapshot vdi"); - bdrv_unref(bs); + blk_unref(blk); ret = -EINVAL; goto out; } s->inode.vdi_id = base->inode.vdi_id; - bdrv_unref(bs); + blk_unref(blk); } s->aio_context = qemu_get_aio_context(); From a08f0c3b5fe8d2e08c3b7e797b359772819fbb92 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 27/40] vdi: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/vdi.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 12407c43e9..662d14b74e 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -52,6 +52,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" #include "qemu/coroutine.h" @@ -733,7 +734,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) size_t bmap_size; int64_t offset = 0; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; uint32_t *bmap = NULL; logout("\n"); @@ -766,14 +767,18 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto exit; } - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* We need enough blocks to store the given disk size, so always round up. */ blocks = DIV_ROUND_UP(bytes, block_size); @@ -803,7 +808,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header)); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -824,7 +829,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, bmap_size); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -833,7 +838,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } if (image_type == VDI_TYPE_STATIC) { - ret = bdrv_truncate(bs, offset + blocks * block_size); + ret = blk_truncate(blk, offset + blocks * block_size); if (ret < 0) { error_setg(errp, "Failed to statically allocate %s", filename); goto exit; @@ -841,7 +846,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } exit: - bdrv_unref(bs); + blk_unref(blk); g_free(bmap); return ret; } From 10bf03af128fa9ff5eb696c7b4dc145406f2de40 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 28/40] vhdx: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/vhdx.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/block/vhdx.c b/block/vhdx.c index ea030ad509..e15020c9be 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" @@ -1772,7 +1773,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) gunichar2 *creator = NULL; glong creator_items; - BlockDriverState *bs; + BlockBackend *blk; char *type = NULL; VHDXImageType image_type; Error *local_err = NULL; @@ -1837,15 +1838,17 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* Create (A) */ /* The creator field is optional, but may be useful for @@ -1853,13 +1856,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); if (ret < 0) { goto delete_and_exit; } if (creator) { - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); if (ret < 0) { goto delete_and_exit; } @@ -1867,13 +1870,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) /* Creates (B),(C) */ - ret = vhdx_create_new_headers(bs, image_size, log_size); + ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); if (ret < 0) { goto delete_and_exit; } /* Creates (D),(E),(G) explicitly. (F) created as by-product */ - ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, log_size, use_zero_blocks, image_type, &metadata_offset); if (ret < 0) { @@ -1881,7 +1884,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } /* Creates (H) */ - ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, metadata_offset, image_type); if (ret < 0) { goto delete_and_exit; @@ -1889,7 +1892,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) delete_and_exit: - bdrv_unref(bs); + blk_unref(blk); exit: g_free(type); g_free(creator); From c4bea1690e3c920f8139356f79ff38ce92ddeaa9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 29/40] vmdk: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/vmdk.c | 77 +++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index dd8093688a..23bd57e20e 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -26,6 +26,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/module.h" @@ -1650,7 +1651,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, QemuOpts *opts, Error **errp) { int ret, i; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; VMDK4Header header; Error *local_err = NULL; uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; @@ -1663,17 +1664,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, goto exit; } - assert(bs == NULL); - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("extent", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + if (flat) { - ret = bdrv_truncate(bs, filesize); + ret = blk_truncate(blk, filesize); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } @@ -1728,18 +1731,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; @@ -1752,8 +1755,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1764,8 +1767,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1773,8 +1776,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, ret = 0; exit: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(gd_buf); return ret; @@ -1823,7 +1826,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix, static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) { int idx = 0; - BlockDriverState *new_bs = NULL; + BlockBackend *new_blk = NULL; Error *local_err = NULL; char *desc = NULL; int64_t total_size = 0, filesize; @@ -1934,7 +1937,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } if (backing_file) { - BlockDriverState *bs = NULL; + BlockBackend *blk; char *full_backing = g_new0(char, PATH_MAX); bdrv_get_full_backing_filename_from_filename(filename, backing_file, full_backing, PATH_MAX, @@ -1945,19 +1948,21 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) ret = -ENOENT; goto exit; } - ret = bdrv_open(&bs, full_backing, NULL, NULL, - BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp); + + blk = blk_new_open("backing", full_backing, NULL, NULL, + BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp); g_free(full_backing); - if (ret != 0) { + if (blk == NULL) { + ret = -EIO; goto exit; } - if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_unref(bs); + if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) { + blk_unref(blk); ret = -EINVAL; goto exit; } - parent_cid = vmdk_read_cid(bs, 0); - bdrv_unref(bs); + parent_cid = vmdk_read_cid(blk_bs(blk), 0); + blk_unref(blk); snprintf(parent_desc_line, BUF_SIZE, "parentFileNameHint=\"%s\"", backing_file); } @@ -2015,15 +2020,19 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } } - assert(new_bs == NULL); - ret = bdrv_open(&new_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + + new_blk = blk_new_open("descriptor", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (new_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } - ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + + blk_set_allow_write_beyond_eof(new_blk, true); + + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -2031,14 +2040,14 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) /* bdrv_pwrite write padding zeros to align to sector, we don't need that * for description file */ if (desc_offset == 0) { - ret = bdrv_truncate(new_bs, desc_len); + ret = blk_truncate(new_blk, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } } exit: - if (new_bs) { - bdrv_unref(new_bs); + if (new_blk) { + blk_unref(new_blk); } g_free(adapter_type); g_free(backing_file); From b8f45cdf7827e39f9a1e6cc446f5972cc6144237 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 8 Mar 2016 15:57:05 +0100 Subject: [PATCH 30/40] vpc: Use BB functions in .bdrv_create() All users of the block layers are supposed to go through a BlockBackend. The .bdrv_create() implementation is one such user, so this patch converts it. Signed-off-by: Kevin Wolf --- block/vpc.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 1db47d63ba..0d1524d6f6 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" #if defined(CONFIG_UUID) @@ -758,7 +759,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, +static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, int64_t total_sectors) { VHDDynDiskHeader *dyndisk_header = @@ -772,13 +773,13 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); if (ret) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); if (ret < 0) { goto fail; } @@ -788,7 +789,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = bdrv_pwrite_sync(bs, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512); if (ret < 0) { goto fail; } @@ -815,7 +816,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, // Write the header offset = 512; - ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024); if (ret < 0) { goto fail; } @@ -824,7 +825,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, return ret; } -static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, +static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, int64_t total_size) { int ret; @@ -832,12 +833,12 @@ static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, /* Add footer to total size */ total_size += HEADER_SIZE; - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { return ret; } - ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); if (ret < 0) { return ret; } @@ -860,7 +861,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int ret = -EIO; bool force_size; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -886,14 +887,18 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto out; } - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } + blk_set_allow_write_beyond_eof(blk, true); + /* * Calculate matching total_size and geometry. Increase the number of * sectors requested until we get enough (or fail). This ensures that @@ -965,13 +970,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(bs, buf, total_sectors); + ret = create_dynamic_disk(blk, buf, total_sectors); } else { - ret = create_fixed_disk(bs, buf, total_size); + ret = create_fixed_disk(blk, buf, total_size); } out: - bdrv_unref(bs); + blk_unref(blk); g_free(disk_type_param); return ret; } From b2f56462d51a49c28d2a7b214b3ae8e8d3329f1f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 12:44:52 +0800 Subject: [PATCH 31/40] backup: Use Bitmap to replace "s->bitmap" "s->bitmap" tracks done sectors, we only check bit states without using any iterator which HBitmap is good for. Switch to "Bitmap" which is simpler and more memory efficient. Meanwhile, rename it to done_bitmap, to reflect the intention. Signed-off-by: Fam Zheng Reviewed-by: John Snow Message-id: 1457412306-18940-2-git-send-email-famz@redhat.com Signed-off-by: Max Reitz --- block/backup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/backup.c b/block/backup.c index 0f1b1bc084..ab3e345e92 100644 --- a/block/backup.c +++ b/block/backup.c @@ -20,6 +20,7 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "qemu/bitmap.h" #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) #define SLICE_TIME 100000000ULL /* ns */ @@ -42,7 +43,7 @@ typedef struct BackupBlockJob { BlockdevOnError on_target_error; CoRwlock flush_rwlock; uint64_t sectors_read; - HBitmap *bitmap; + unsigned long *done_bitmap; int64_t cluster_size; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; @@ -116,7 +117,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, cow_request_begin(&cow_request, job, start, end); for (; start < end; start++) { - if (hbitmap_get(job->bitmap, start)) { + if (test_bit(start, job->done_bitmap)) { trace_backup_do_cow_skip(job, start); continue; /* already copied */ } @@ -167,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, goto out; } - hbitmap_set(job->bitmap, start, 1); + set_bit(start, job->done_bitmap); /* Publish progress, guest I/O counts as progress too. Note that the * offset field is an opaque progress value, it is not a disk offset. @@ -399,7 +400,7 @@ static void coroutine_fn backup_run(void *opaque) start = 0; end = DIV_ROUND_UP(job->common.len, job->cluster_size); - job->bitmap = hbitmap_alloc(end, 0); + job->done_bitmap = bitmap_new(end); bdrv_set_enable_write_cache(target, true); if (target->blk) { @@ -480,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); - hbitmap_free(job->bitmap); + g_free(job->done_bitmap); if (target->blk) { blk_iostatus_disable(target->blk); From 78f9dc859d2db4c3bad382e09e0935bf959d07a6 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 12:44:53 +0800 Subject: [PATCH 32/40] block: Include hbitmap.h in block.h Signed-off-by: Fam Zheng Reviewed-by: John Snow Message-id: 1457412306-18940-3-git-send-email-famz@redhat.com Signed-off-by: Max Reitz --- include/block/block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/block/block.h b/include/block/block.h index 3900c4d8c3..66c55ea77b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -8,6 +8,7 @@ #include "block/accounting.h" #include "qapi/qmp/qobject.h" #include "qapi-types.h" +#include "qemu/hbitmap.h" /* block.c */ typedef struct BlockDriver BlockDriver; @@ -474,7 +475,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, From 9a3f5cf1bfd2b1e11a496bdaf038cc7d825e01df Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 12:44:54 +0800 Subject: [PATCH 33/40] typedefs: Add BdrvDirtyBitmap Following patches to refactor and move block dirty bitmap code could use this. Signed-off-by: Fam Zheng Reviewed-by: John Snow Message-id: 1457412306-18940-4-git-send-email-famz@redhat.com Signed-off-by: Max Reitz --- include/block/block.h | 1 - include/qemu/typedefs.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/block/block.h b/include/block/block.h index 66c55ea77b..a4031765ea 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -475,7 +475,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 9a5ead69a1..fd039e0e81 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -10,6 +10,7 @@ typedef struct AddressSpace AddressSpace; typedef struct AioContext AioContext; typedef struct AllwinnerAHCIState AllwinnerAHCIState; typedef struct AudioState AudioState; +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; typedef struct BlockBackend BlockBackend; typedef struct BlockBackendRootState BlockBackendRootState; typedef struct BlockDriverState BlockDriverState; From ebab225910d7bb3d805176ff89b5efc5f18653c3 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 12:44:55 +0800 Subject: [PATCH 34/40] block: Move block dirty bitmap code to separate files The only code change is making bdrv_dirty_bitmap_truncate public. It is used in block.c. Also two long lines (bdrv_get_dirty) are wrapped. Signed-off-by: Fam Zheng Reviewed-by: John Snow Message-id: 1457412306-18940-5-git-send-email-famz@redhat.com Signed-off-by: Max Reitz --- block.c | 360 -------------------------------- block/Makefile.objs | 2 +- block/dirty-bitmap.c | 387 +++++++++++++++++++++++++++++++++++ include/block/block.h | 35 +--- include/block/dirty-bitmap.h | 44 ++++ 5 files changed, 433 insertions(+), 395 deletions(-) create mode 100644 block/dirty-bitmap.c create mode 100644 include/block/dirty-bitmap.h diff --git a/block.c b/block.c index cf5eb34382..59a18a3a66 100644 --- a/block.c +++ b/block.c @@ -53,23 +53,6 @@ #include #endif -/** - * A BdrvDirtyBitmap can be in three possible states: - * (1) successor is NULL and disabled is false: full r/w mode - * (2) successor is NULL and disabled is true: read only mode ("disabled") - * (3) successor is set: frozen mode. - * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, - * or enabled. A frozen bitmap can only abdicate() or reclaim(). - */ -struct BdrvDirtyBitmap { - HBitmap *bitmap; /* Dirty sector bitmap implementation */ - BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ - char *name; /* Optional non-empty unique ID */ - int64_t size; /* Size of the bitmap (Number of sectors) */ - bool disabled; /* Bitmap is read-only */ - QLIST_ENTRY(BdrvDirtyBitmap) list; -}; - #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -88,9 +71,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, BlockDriverState *parent, const BdrvChildRole *child_role, Error **errp); -static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); -static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs); - /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -3445,346 +3425,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) -{ - BdrvDirtyBitmap *bm; - - assert(name); - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - if (bm->name && !strcmp(name, bm->name)) { - return bm; - } - } - return NULL; -} - -void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - g_free(bitmap->name); - bitmap->name = NULL; -} - -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - uint32_t granularity, - const char *name, - Error **errp) -{ - int64_t bitmap_size; - BdrvDirtyBitmap *bitmap; - uint32_t sector_granularity; - - assert((granularity & (granularity - 1)) == 0); - - if (name && bdrv_find_dirty_bitmap(bs, name)) { - error_setg(errp, "Bitmap already exists: %s", name); - return NULL; - } - sector_granularity = granularity >> BDRV_SECTOR_BITS; - assert(sector_granularity); - bitmap_size = bdrv_nb_sectors(bs); - if (bitmap_size < 0) { - error_setg_errno(errp, -bitmap_size, "could not get length of device"); - errno = -bitmap_size; - return NULL; - } - bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); - bitmap->size = bitmap_size; - bitmap->name = g_strdup(name); - bitmap->disabled = false; - QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); - return bitmap; -} - -bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) -{ - return bitmap->successor; -} - -bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) -{ - return !(bitmap->disabled || bitmap->successor); -} - -DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) -{ - if (bdrv_dirty_bitmap_frozen(bitmap)) { - return DIRTY_BITMAP_STATUS_FROZEN; - } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { - return DIRTY_BITMAP_STATUS_DISABLED; - } else { - return DIRTY_BITMAP_STATUS_ACTIVE; - } -} - -/** - * Create a successor bitmap destined to replace this bitmap after an operation. - * Requires that the bitmap is not frozen and has no successor. - */ -int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, Error **errp) -{ - uint64_t granularity; - BdrvDirtyBitmap *child; - - if (bdrv_dirty_bitmap_frozen(bitmap)) { - error_setg(errp, "Cannot create a successor for a bitmap that is " - "currently frozen"); - return -1; - } - assert(!bitmap->successor); - - /* Create an anonymous successor */ - granularity = bdrv_dirty_bitmap_granularity(bitmap); - child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); - if (!child) { - return -1; - } - - /* Successor will be on or off based on our current state. */ - child->disabled = bitmap->disabled; - - /* Install the successor and freeze the parent */ - bitmap->successor = child; - return 0; -} - -/** - * For a bitmap with a successor, yield our name to the successor, - * delete the old bitmap, and return a handle to the new bitmap. - */ -BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp) -{ - char *name; - BdrvDirtyBitmap *successor = bitmap->successor; - - if (successor == NULL) { - error_setg(errp, "Cannot relinquish control if " - "there's no successor present"); - return NULL; - } - - name = bitmap->name; - bitmap->name = NULL; - successor->name = name; - bitmap->successor = NULL; - bdrv_release_dirty_bitmap(bs, bitmap); - - return successor; -} - -/** - * In cases of failure where we can no longer safely delete the parent, - * we may wish to re-join the parent and child/successor. - * The merged parent will be un-frozen, but not explicitly re-enabled. - */ -BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *parent, - Error **errp) -{ - BdrvDirtyBitmap *successor = parent->successor; - - if (!successor) { - error_setg(errp, "Cannot reclaim a successor when none is present"); - return NULL; - } - - if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { - error_setg(errp, "Merging of parent and successor bitmap failed"); - return NULL; - } - bdrv_release_dirty_bitmap(bs, successor); - parent->successor = NULL; - - return parent; -} - -/** - * Truncates _all_ bitmaps attached to a BDS. - */ -static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bitmap; - uint64_t size = bdrv_nb_sectors(bs); - - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - hbitmap_truncate(bitmap->bitmap, size); - bitmap->size = size; - } -} - -static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - bool only_named) -{ - BdrvDirtyBitmap *bm, *next; - QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { - if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { - assert(!bdrv_dirty_bitmap_frozen(bm)); - QLIST_REMOVE(bm, list); - hbitmap_free(bm->bitmap); - g_free(bm->name); - g_free(bm); - - if (bitmap) { - return; - } - } - } -} - -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) -{ - bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); -} - -/** - * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). - * There must not be any frozen bitmaps attached. - */ -static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) -{ - bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); -} - -void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = true; -} - -void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = false; -} - -BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bm; - BlockDirtyInfoList *list = NULL; - BlockDirtyInfoList **plist = &list; - - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); - BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bm); - info->granularity = bdrv_dirty_bitmap_granularity(bm); - info->has_name = !!bm->name; - info->name = g_strdup(bm->name); - info->status = bdrv_dirty_bitmap_status(bm); - entry->value = info; - *plist = entry; - plist = &entry->next; - } - - return list; -} - -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) -{ - if (bitmap) { - return hbitmap_get(bitmap->bitmap, sector); - } else { - return 0; - } -} - -/** - * Chooses a default granularity based on the existing cluster size, - * but clamped between [4K, 64K]. Defaults to 64K in the case that there - * is no cluster size information available. - */ -uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - uint32_t granularity; - - if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } - - return granularity; -} - -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) -{ - return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); -} - -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) -{ - hbitmap_iter_init(hbi, bitmap->bitmap, 0); -} - -void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - if (!out) { - hbitmap_reset_all(bitmap->bitmap); - } else { - HBitmap *backup = bitmap->bitmap; - bitmap->bitmap = hbitmap_alloc(bitmap->size, - hbitmap_granularity(backup)); - *out = backup; - } -} - -void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) -{ - HBitmap *tmp = bitmap->bitmap; - assert(bdrv_dirty_bitmap_enabled(bitmap)); - bitmap->bitmap = in; - hbitmap_free(tmp); -} - -void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) -{ - BdrvDirtyBitmap *bitmap; - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - if (!bdrv_dirty_bitmap_enabled(bitmap)) { - continue; - } - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); - } -} - -/** - * Advance an HBitmapIter to an arbitrary offset. - */ -void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) -{ - assert(hbi->hb); - hbitmap_iter_init(hbi, hbi->hb, offset); -} - -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) -{ - return hbitmap_count(bitmap->bitmap); -} - /* Get a reference to bs */ void bdrv_ref(BlockDriverState *bs) { diff --git a/block/Makefile.objs b/block/Makefile.objs index 58ef2ef3f2..cdd865597a 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -20,7 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o -block-obj-y += accounting.o +block-obj-y += accounting.o dirty-bitmap.o block-obj-y += write-threshold.o common-obj-y += stream.o diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c new file mode 100644 index 0000000000..556e1d15c4 --- /dev/null +++ b/block/dirty-bitmap.c @@ -0,0 +1,387 @@ +/* + * Block Dirty Bitmap + * + * Copyright (c) 2016 Red Hat. Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "config-host.h" +#include "qemu-common.h" +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" + +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). + */ +struct BdrvDirtyBitmap { + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ + QLIST_ENTRY(BdrvDirtyBitmap) list; +}; + +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) +{ + BdrvDirtyBitmap *bm; + + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } + return NULL; +} + +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp) +{ + int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; + + assert((granularity & (granularity - 1)) == 0); + + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); + bitmap_size = bdrv_nb_sectors(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, -bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; + bitmap->name = g_strdup(name); + bitmap->disabled = false; + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; +} + +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) +{ + return bitmap->successor; +} + +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) +{ + return !(bitmap->disabled || bitmap->successor); +} + +DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) +{ + if (bdrv_dirty_bitmap_frozen(bitmap)) { + return DIRTY_BITMAP_STATUS_FROZEN; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + return DIRTY_BITMAP_STATUS_DISABLED; + } else { + return DIRTY_BITMAP_STATUS_ACTIVE; + } +} + +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. + */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) +{ + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); + + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; + } + + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; +} + +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) +{ + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; + + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; + } + + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; +} + +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. + */ +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) +{ + BdrvDirtyBitmap *successor = parent->successor; + + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; + } + + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; +} + +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); + + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + hbitmap_truncate(bitmap->bitmap, size); + bitmap->size = size; + } +} + +static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + bool only_named) +{ + BdrvDirtyBitmap *bm, *next; + QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { + if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { + assert(!bdrv_dirty_bitmap_frozen(bm)); + QLIST_REMOVE(bm, list); + hbitmap_free(bm->bitmap); + g_free(bm->name); + g_free(bm); + + if (bitmap) { + return; + } + } + } +} + +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); +} + +/** + * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). + * There must not be any frozen bitmaps attached. + */ +void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) +{ + bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); +} + +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = false; +} + +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + BlockDirtyInfoList *list = NULL; + BlockDirtyInfoList **plist = &list; + + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); + BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); + info->count = bdrv_get_dirty_count(bm); + info->granularity = bdrv_dirty_bitmap_granularity(bm); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); + info->status = bdrv_dirty_bitmap_status(bm); + entry->value = info; + *plist = entry; + plist = &entry->next; + } + + return list; +} + +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) +{ + if (bitmap) { + return hbitmap_get(bitmap->bitmap, sector); + } else { + return 0; + } +} + +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. Defaults to 64K in the case that there + * is no cluster size information available. + */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +{ + hbitmap_iter_init(hbi, bitmap->bitmap, 0); +} + +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + if (!out) { + hbitmap_reset_all(bitmap->bitmap); + } else { + HBitmap *backup = bitmap->bitmap; + bitmap->bitmap = hbitmap_alloc(bitmap->size, + hbitmap_granularity(backup)); + *out = backup; + } +} + +void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) +{ + HBitmap *tmp = bitmap->bitmap; + assert(bdrv_dirty_bitmap_enabled(bitmap)); + bitmap->bitmap = in; + hbitmap_free(tmp); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + } +} + +/** + * Advance an HBitmapIter to an arbitrary offset. + */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) +{ + return hbitmap_count(bitmap->bitmap); +} diff --git a/include/block/block.h b/include/block/block.h index a4031765ea..9688d7f421 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -6,6 +6,7 @@ #include "qemu/option.h" #include "qemu/coroutine.h" #include "block/accounting.h" +#include "block/dirty-bitmap.h" #include "qapi/qmp/qobject.h" #include "qapi-types.h" #include "qemu/hbitmap.h" @@ -475,40 +476,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - uint32_t granularity, - const char *name, - Error **errp); -int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, - const char *name); -void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); -void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); -void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); -BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); -uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); -bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); -bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); -DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors); -void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors); -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); -void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); - void bdrv_enable_copy_on_read(BlockDriverState *bs); void bdrv_disable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h new file mode 100644 index 0000000000..80afe603f6 --- /dev/null +++ b/include/block/dirty-bitmap.h @@ -0,0 +1,44 @@ +#ifndef BLOCK_DIRTY_BITMAP_H +#define BLOCK_DIRTY_BITMAP_H + +#include "qemu-common.h" +#include "qemu/hbitmap.h" + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp); +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, + const char *name); +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs); +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); +DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector); +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors); +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); + +#endif From fcce736719d04effab3d9a07d29ab4cecaf841dd Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 8 Mar 2016 12:44:56 +0800 Subject: [PATCH 35/40] block: Remove unused typedef of BlockDriverDirtyHandler Signed-off-by: Fam Zheng Reviewed-by: John Snow Message-id: 1457412306-18940-6-git-send-email-famz@redhat.com Signed-off-by: Max Reitz --- include/block/block.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 9688d7f421..eaa64262d9 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -321,8 +321,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); /* async block I/O */ -typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector, - int sector_num); BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque); From e3f66e03680ec2a1f43393d551bc83c5d469f84b Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 11 Mar 2016 15:14:47 +0100 Subject: [PATCH 36/40] iotests: Correct 081's reference output The newly added type parameter for the QUORUM_REPORT_BAD event changed the output of iotest 081, so the reference should be amended accordingly. Signed-off-by: Max Reitz Message-id: 1457705687-27122-1-git-send-email-mreitz@redhat.com Reviewed-by: Alberto Garcia --- tests/qemu-iotests/081.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out index 70632314c8..97df69d71c 100644 --- a/tests/qemu-iotests/081.out +++ b/tests/qemu-iotests/081.out @@ -31,7 +31,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "drive2", "sectors-count": 20480, "sector-num": 0}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "drive2", "sectors-count": 20480, "sector-num": 0, "type": "read"}} read 10485760/10485760 bytes at offset 0 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": ""} From b9c600d20716b3d942cb07188ff998fb236a8365 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 10 Mar 2016 13:55:24 +0200 Subject: [PATCH 37/40] quorum: Fix crash in quorum_aio_cb() quorum_aio_cb() emits the QUORUM_REPORT_BAD event if there's an I/O error in a Quorum child. However sacb->aiocb must be correctly initialized for this to happen. read_quorum_children() and read_fifo_child() are not doing this, which results in a QEMU crash. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 8138570d071ba7e25db3736979234a1fd71dbd05.1457610443.git.berto@igalia.com Signed-off-by: Max Reitz --- block/quorum.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index b16171b57e..3d473515a8 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -655,8 +655,9 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - bdrv_aio_readv(s->children[i]->bs, acb->sector_num, &acb->qcrs[i].qiov, - acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + &acb->qcrs[i].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); } return &acb->common; @@ -671,9 +672,10 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, acb->qcrs[acb->child_iter].buf); - bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, - &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[acb->child_iter]); + acb->qcrs[acb->child_iter].aiocb = + bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[acb->child_iter]); return &acb->common; } From 6d425eb94d34df3fdcebbe512ae6091a307cb022 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 10 Mar 2016 13:55:25 +0200 Subject: [PATCH 38/40] monitor: Separate QUORUM_REPORT_BAD events according to the node name The QUORUM_REPORT_BAD event is emitted whenever there's an I/O error in a child of a Quorum device. This event is emitted at a maximum rate of 1 per second. This means that an error in one of the children will mask errors in the other children if they happen within the same 1 second interval. This patch modifies qapi_event_throttle_equal() so QUORUM_REPORT_BAD events are kept separately if they come from different children. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: b989c0cb3755bc4b6696e796fa8ed2ef6c56606a.1457610443.git.berto@igalia.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- monitor.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/monitor.c b/monitor.c index e99ca8c91e..c9fe862754 100644 --- a/monitor.c +++ b/monitor.c @@ -572,6 +572,10 @@ static unsigned int qapi_event_throttle_hash(const void *key) hash += g_str_hash(qdict_get_str(evstate->data, "id")); } + if (evstate->event == QAPI_EVENT_QUORUM_REPORT_BAD) { + hash += g_str_hash(qdict_get_str(evstate->data, "node-name")); + } + return hash; } @@ -589,6 +593,11 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b) qdict_get_str(evb->data, "id")); } + if (eva->event == QAPI_EVENT_QUORUM_REPORT_BAD) { + return !strcmp(qdict_get_str(eva->data, "node-name"), + qdict_get_str(evb->data, "node-name")); + } + return TRUE; } From dc5999787181f6d090217f45570067e55333835b Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 10 Mar 2016 13:55:26 +0200 Subject: [PATCH 39/40] monitor: Use QEMU_CLOCK_VIRTUAL for the event queue in qtest mode This allows us to perform tests on the monitor queues to verify that the rate limits are enforced. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: dde511809e954a5c32d5b648bb184c03c89ed5d5.1457610443.git.berto@igalia.com Signed-off-by: Max Reitz --- monitor.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/monitor.c b/monitor.c index c9fe862754..894f862dd3 100644 --- a/monitor.c +++ b/monitor.c @@ -76,6 +76,7 @@ #include "qapi-event.h" #include "qmp-introspect.h" #include "sysemu/block-backend.h" +#include "sysemu/qtest.h" /* for hmp_info_irq/pic */ #if defined(TARGET_SPARC) @@ -232,6 +233,8 @@ static const mon_cmd_t qmp_cmds[]; Monitor *cur_mon; +static QEMUClockType event_clock_type = QEMU_CLOCK_REALTIME; + static void monitor_command_cb(void *opaque, const char *cmdline, void *readline_opaque); @@ -513,7 +516,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp) * monitor_qapi_event_handler() in evconf->rate ns. Any * events arriving before then will be delayed until then. */ - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + int64_t now = qemu_clock_get_ns(event_clock_type); monitor_qapi_event_emit(event, qdict); @@ -522,7 +525,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp) evstate->data = data; QINCREF(evstate->data); evstate->qdict = NULL; - evstate->timer = timer_new_ns(QEMU_CLOCK_REALTIME, + evstate->timer = timer_new_ns(event_clock_type, monitor_qapi_event_handler, evstate); g_hash_table_add(monitor_qapi_event_state, evstate); @@ -547,7 +550,7 @@ static void monitor_qapi_event_handler(void *opaque) qemu_mutex_lock(&monitor_lock); if (evstate->qdict) { - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + int64_t now = qemu_clock_get_ns(event_clock_type); monitor_qapi_event_emit(evstate->event, evstate->qdict); QDECREF(evstate->qdict); @@ -603,6 +606,10 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b) static void monitor_qapi_event_init(void) { + if (qtest_enabled()) { + event_clock_type = QEMU_CLOCK_VIRTUAL; + } + monitor_qapi_event_state = g_hash_table_new(qapi_event_throttle_hash, qapi_event_throttle_equal); qmp_event_set_func_emit(monitor_qapi_event_queue); From 7223c48cff7aec07517a30c2fbfadbb45de9eb14 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 10 Mar 2016 13:55:27 +0200 Subject: [PATCH 40/40] iotests: Add test for QMP event rates This test verifies that the rate-limited QMP events are emitted at a maximum rate of 1 per second as defined in monitor_qapi_event_conf in monitor.c It also checks that QUORUM_REPORT_BAD events generated from different nodes are kept in separate queues so they don't mask each other. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 0dbd3ee88a59a6363042ad81cfb345037bfbf612.1457610443.git.berto@igalia.com [mreitz@redhat.com: Renamed test from 146 to 148] Signed-off-by: Max Reitz --- tests/qemu-iotests/148 | 129 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/148.out | 5 ++ tests/qemu-iotests/group | 1 + 3 files changed, 135 insertions(+) create mode 100644 tests/qemu-iotests/148 create mode 100644 tests/qemu-iotests/148.out diff --git a/tests/qemu-iotests/148 b/tests/qemu-iotests/148 new file mode 100644 index 0000000000..30bc37958e --- /dev/null +++ b/tests/qemu-iotests/148 @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# +# Test the rate limit of QMP events +# +# Copyright (C) 2016 Igalia, S.L. +# Author: Alberto Garcia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import os +import iotests + +imgs = (os.path.join(iotests.test_dir, 'quorum0.img'), + os.path.join(iotests.test_dir, 'quorum1.img'), + os.path.join(iotests.test_dir, 'quorum2.img')) + +img_conf = (os.path.join(iotests.test_dir, 'quorum0.conf'), + os.path.join(iotests.test_dir, 'quorum1.conf'), + os.path.join(iotests.test_dir, 'quorum2.conf')) + +event_rate = 1000000000 +sector_size = 512 +offset = 10 + +class TestQuorumEvents(iotests.QMPTestCase): + + def create_blkdebug_file(self, blkdebug_file, bad_sector): + file = open(blkdebug_file, 'w') + file.write(''' +[inject-error] +event = "read_aio" +errno = "5" +sector = "%d" +''' % bad_sector) + file.close() + + def setUp(self): + driveopts = ['driver=quorum', 'vote-threshold=2'] + for i in range(len(imgs)): + iotests.qemu_img('create', '-f', iotests.imgfmt, imgs[i], '1M') + self.create_blkdebug_file(img_conf[i], i + offset) + driveopts.append('children.%d.driver=%s' % (i, iotests.imgfmt)) + driveopts.append('children.%d.file.driver=blkdebug' % i) + driveopts.append('children.%d.file.config=%s' % (i, img_conf[i])) + driveopts.append('children.%d.file.image.filename=%s' % (i, imgs[i])) + driveopts.append('children.%d.node-name=img%d' % (i, i)) + self.vm = iotests.VM() + self.vm.add_drive(None, opts = ','.join(driveopts)) + self.vm.launch() + + def tearDown(self): + self.vm.shutdown() + for i in range(len(imgs)): + os.remove(imgs[i]) + os.remove(img_conf[i]) + + def do_check_event(self, node, sector = 0): + if node == None: + self.assertEqual(self.vm.get_qmp_event(), None) + return + + for event in self.vm.get_qmp_events(wait=True): + if event['event'] == 'QUORUM_REPORT_BAD': + self.assert_qmp(event, 'data/node-name', node) + self.assert_qmp(event, 'data/sector-num', sector) + + def testQuorum(self): + if not 'quorum' in iotests.qemu_img_pipe('--help'): + return + + # Generate an error and get an event + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step 10") + self.do_check_event('img0', offset) + + # I/O errors in the same child: only one event is emitted + delay = 10 + for i in range(3): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event(None) + + # Wait enough so the event is finally emitted + self.vm.qtest("clock_step %d" % (2 * event_rate)) + self.do_check_event('img0', offset) + + # I/O errors in the same child: all events are emitted + delay = 2 * event_rate + for i in range(3): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img0', offset) + + # I/O errors in different children: all events are emitted + delay = 10 + for i in range(len(imgs)): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + ((offset + i) * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img%d' % i, offset + i) + + # I/O errors in different children: all events are emitted + delay = 2 * event_rate + for i in range(len(imgs)): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + ((offset + i) * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img%d' % i, offset + i) + + # No more pending events + self.do_check_event(None) + +if __name__ == '__main__': + iotests.main(supported_fmts=["raw"]) diff --git a/tests/qemu-iotests/148.out b/tests/qemu-iotests/148.out new file mode 100644 index 0000000000..ae1213e6f8 --- /dev/null +++ b/tests/qemu-iotests/148.out @@ -0,0 +1,5 @@ +. +---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 1211149249..faf0f21397 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -149,3 +149,4 @@ 144 rw auto quick 145 auto quick 146 auto quick +148 rw auto quick