From 7725b8bf129de6887fb680da28775fab5eded914 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 9 Nov 2015 08:09:33 +0100 Subject: [PATCH 01/12] block/nfs: add support for setting debug level recent libnfs versions support logging debug messages. Add support for it in qemu through an URL parameter. Example: qemu -cdrom nfs://127.0.0.1/iso/my.iso?debug=2 Signed-off-by: Peter Lieven Reviewed-by: Fam Zheng Message-id: 1447052973-14513-1-git-send-email-pl@kamp.de Signed-off-by: Jeff Cody --- block/nfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/nfs.c b/block/nfs.c index 5eb8c133b9..7220e8991b 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -36,6 +36,7 @@ #include #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 +#define QEMU_NFS_MAX_DEBUG_LEVEL 2 typedef struct NFSClient { struct nfs_context *context; @@ -333,6 +334,17 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, val = QEMU_NFS_MAX_READAHEAD_SIZE; } nfs_set_readahead(client->context, val); +#endif +#ifdef LIBNFS_FEATURE_DEBUG + } else if (!strcmp(qp->p[i].name, "debug")) { + /* limit the maximum debug level to avoid potential flooding + * of our log files. */ + if (val > QEMU_NFS_MAX_DEBUG_LEVEL) { + error_report("NFS Warning: Limiting NFS debug level" + " to %d", QEMU_NFS_MAX_DEBUG_LEVEL); + val = QEMU_NFS_MAX_DEBUG_LEVEL; + } + nfs_set_debug(client->context, val); #endif } else { error_setg(errp, "Unknown NFS parameter name: %s", From eab8eb8db39813a4ef4655f04128058494a3c4d3 Mon Sep 17 00:00:00 2001 From: Vasiliy Tolstov Date: Wed, 23 Dec 2015 21:22:26 +0900 Subject: [PATCH 02/12] sheepdog: allow to delete snapshot This patch implements a blockdriver function bdrv_snapshot_delete() in the sheepdog driver. With the new function, snapshots of sheepdog can be deleted from libvirt. 
Cc: Jeff Cody Signed-off-by: Hitoshi Mitake Signed-off-by: Vasiliy Tolstov Message-id: 1450873346-22334-1-git-send-email-mitake.hitoshi@lab.ntt.co.jp Signed-off-by: Jeff Cody --- block/sheepdog.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 2 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index a0098c1165..8739accddd 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -284,6 +284,12 @@ static inline bool is_snapshot(struct SheepdogInode *inode) return !!inode->snap_ctime; } +static inline size_t count_data_objs(const struct SheepdogInode *inode) +{ + return DIV_ROUND_UP(inode->vdi_size, + (1UL << inode->block_size_shift)); +} + #undef DPRINTF #ifdef DEBUG_SDOG #define DPRINTF(fmt, args...) \ @@ -2478,13 +2484,128 @@ out: return ret; } +#define NR_BATCHED_DISCARD 128 + +static bool remove_objects(BDRVSheepdogState *s) +{ + int fd, i = 0, nr_objs = 0; + Error *local_err = NULL; + int ret = 0; + bool result = true; + SheepdogInode *inode = &s->inode; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return false; + } + + nr_objs = count_data_objs(inode); + while (i < nr_objs) { + int start_idx, nr_filled_idx; + + while (i < nr_objs && !inode->data_vdi_id[i]) { + i++; + } + start_idx = i; + + nr_filled_idx = 0; + while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) { + if (inode->data_vdi_id[i]) { + inode->data_vdi_id[i] = 0; + nr_filled_idx++; + } + + i++; + } + + ret = write_object(fd, s->aio_context, + (char *)&inode->data_vdi_id[start_idx], + vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies, + (i - start_idx) * sizeof(uint32_t), + offsetof(struct SheepdogInode, + data_vdi_id[start_idx]), + false, s->cache_flags); + if (ret < 0) { + error_report("failed to discard snapshot inode."); + result = false; + goto out; + } + } + +out: + closesocket(fd); + return result; +} + static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id, const char 
*name, Error **errp) { - /* FIXME: Delete specified snapshot id. */ - return 0; + uint32_t snap_id = 0; + char snap_tag[SD_MAX_VDI_TAG_LEN]; + Error *local_err = NULL; + int fd, ret; + char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; + BDRVSheepdogState *s = bs->opaque; + unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0; + uint32_t vid; + SheepdogVdiReq hdr = { + .opcode = SD_OP_DEL_VDI, + .data_length = wlen, + .flags = SD_FLAG_CMD_WRITE, + }; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + + if (!remove_objects(s)) { + return -1; + } + + memset(buf, 0, sizeof(buf)); + memset(snap_tag, 0, sizeof(snap_tag)); + pstrcpy(buf, SD_MAX_VDI_LEN, s->name); + if (qemu_strtoul(snapshot_id, NULL, 10, (unsigned long *)&snap_id)) { + return -1; + } + + if (snap_id) { + hdr.snapid = snap_id; + } else { + pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id); + pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); + } + + ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, + &local_err); + if (ret) { + return ret; + } + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return -1; + } + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + buf, &wlen, &rlen); + closesocket(fd); + if (ret) { + return ret; + } + + switch (rsp->result) { + case SD_RES_NO_VDI: + error_report("%s was already deleted", s->name); + case SD_RES_SUCCESS: + break; + default: + error_report("%s, %s", sd_strerror(rsp->result), s->name); + return -1; + } + + return ret; } static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) From 60390a2192e7b38aee18db6ce7fb740498709737 Mon Sep 17 00:00:00 2001 From: "Daniel P. 
Berrange" Date: Thu, 21 Jan 2016 14:19:19 +0000 Subject: [PATCH 03/12] rbd: add support for getting password from QCryptoSecret object Currently RBD passwords must be provided on the command line via $QEMU -drive file=rbd:pool/image:id=myname:\ key=QVFDVm41aE82SHpGQWhBQXEwTkN2OGp0SmNJY0UrSE9CbE1RMUE=:\ auth_supported=cephx This is insecure because the key is visible in the OS process listing. This adds support for a 'password-secret' parameter in the RBD parameters that can be used with the QCryptoSecret object to provide the password via a file: echo "QVFDVm41aE82SHpGQWhBQXEwTkN2OGp0SmNJY0UrSE9CbE1RMUE=" > poolkey.b64 $QEMU -object secret,id=secret0,file=poolkey.b64,format=base64 \ -drive driver=rbd,filename=rbd:pool/image:id=myname:\ auth_supported=cephx,password-secret=secret0 Reviewed-by: Josh Durgin Signed-off-by: Daniel P. Berrange Message-id: 1453385961-10718-2-git-send-email-berrange@redhat.com Signed-off-by: Jeff Cody --- block/rbd.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/block/rbd.c b/block/rbd.c index 51b64f3fed..abfea612ec 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -16,6 +16,7 @@ #include "qemu-common.h" #include "qemu/error-report.h" #include "block/block_int.h" +#include "crypto/secret.h" #include @@ -228,6 +229,27 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) return NULL; } + +static int qemu_rbd_set_auth(rados_t cluster, const char *secretid, + Error **errp) +{ + if (secretid == 0) { + return 0; + } + + gchar *secret = qcrypto_secret_lookup_as_base64(secretid, + errp); + if (!secret) { + return -1; + } + + rados_conf_set(cluster, "key", secret); + g_free(secret); + + return 0; +} + + static int qemu_rbd_set_conf(rados_t cluster, const char *conf, bool only_read_conf_file, Error **errp) @@ -299,10 +321,13 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) char conf[RBD_MAX_CONF_SIZE]; char clientname_buf[RBD_MAX_CONF_SIZE]; 
char *clientname; + const char *secretid; rados_t cluster; rados_ioctx_t io_ctx; int ret; + secretid = qemu_opt_get(opts, "password-secret"); + if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), name, sizeof(name), @@ -350,6 +375,11 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) return -EIO; } + if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) { + rados_shutdown(cluster); + return -EIO; + } + if (rados_connect(cluster) < 0) { error_setg(errp, "error connecting"); rados_shutdown(cluster); @@ -423,6 +453,11 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "Specification of the rbd image", }, + { + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of secret providing the password", + }, { /* end of list */ } }, }; @@ -436,6 +471,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, char conf[RBD_MAX_CONF_SIZE]; char clientname_buf[RBD_MAX_CONF_SIZE]; char *clientname; + const char *secretid; QemuOpts *opts; Error *local_err = NULL; const char *filename; @@ -450,6 +486,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } filename = qemu_opt_get(opts, "filename"); + secretid = qemu_opt_get(opts, "password-secret"); if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), @@ -488,6 +525,11 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } } + if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) { + r = -EIO; + goto failed_shutdown; + } + /* * Fallback to more conservative semantics if setting cache * options fails. 
Ignore errors from setting rbd_cache because the @@ -919,6 +961,11 @@ static QemuOptsList qemu_rbd_create_opts = { .type = QEMU_OPT_SIZE, .help = "RBD object size" }, + { + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of secret providing the password", + }, { /* end of list */ } } }; From 1bff96064290b2e9594dcbeea967b3d8cc76e2b2 Mon Sep 17 00:00:00 2001 From: "Daniel P. Berrange" Date: Thu, 21 Jan 2016 14:19:20 +0000 Subject: [PATCH 04/12] curl: add support for HTTP authentication parameters If connecting to a web server which has authentication turned on, QEMU gets a 401 as curl has not been configured with any authentication credentials. This adds 4 new parameters to the curl block driver options 'username', 'password-secret', 'proxy-username' and 'proxy-password-secret'. Passwords are provided using the recently added 'secret' object type $QEMU \ -object secret,id=sec0,filename=/home/berrange/example.pw \ -object secret,id=sec1,filename=/home/berrange/proxy.pw \ -drive driver=http,url=http://example.com/some.img,\ username=dan,password-secret=sec0,\ proxy-username=dan,proxy-password-secret=sec1 Of course it is possible to use the same secret for both the proxy & server passwords if desired, or omit the proxy auth details, or the server auth details as required. Signed-off-by: Daniel P. 
Berrange Message-id: 1453385961-10718-3-git-send-email-berrange@redhat.com Signed-off-by: Jeff Cody --- block/curl.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/block/curl.c b/block/curl.c index 1507e0ac34..c70bfb404d 100644 --- a/block/curl.c +++ b/block/curl.c @@ -27,6 +27,7 @@ #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qstring.h" +#include "crypto/secret.h" #include // #define DEBUG_CURL @@ -78,6 +79,10 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, #define CURL_BLOCK_OPT_SSLVERIFY "sslverify" #define CURL_BLOCK_OPT_TIMEOUT "timeout" #define CURL_BLOCK_OPT_COOKIE "cookie" +#define CURL_BLOCK_OPT_USERNAME "username" +#define CURL_BLOCK_OPT_PASSWORD_SECRET "password-secret" +#define CURL_BLOCK_OPT_PROXY_USERNAME "proxy-username" +#define CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET "proxy-password-secret" struct BDRVCURLState; @@ -120,6 +125,10 @@ typedef struct BDRVCURLState { char *cookie; bool accept_range; AioContext *aio_context; + char *username; + char *password; + char *proxyusername; + char *proxypassword; } BDRVCURLState; static void curl_clean_state(CURLState *s); @@ -419,6 +428,21 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + if (s->username) { + curl_easy_setopt(state->curl, CURLOPT_USERNAME, s->username); + } + if (s->password) { + curl_easy_setopt(state->curl, CURLOPT_PASSWORD, s->password); + } + if (s->proxyusername) { + curl_easy_setopt(state->curl, + CURLOPT_PROXYUSERNAME, s->proxyusername); + } + if (s->proxypassword) { + curl_easy_setopt(state->curl, + CURLOPT_PROXYPASSWORD, s->proxypassword); + } + /* Restrict supported protocols to avoid security issues in the more * obscure protocols. For example, do not allow POP3/SMTP/IMAP see * CVE-2013-0249. 
@@ -525,10 +549,31 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "Pass the cookie or list of cookies with each request" }, + { + .name = CURL_BLOCK_OPT_USERNAME, + .type = QEMU_OPT_STRING, + .help = "Username for HTTP auth" + }, + { + .name = CURL_BLOCK_OPT_PASSWORD_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of secret used as password for HTTP auth", + }, + { + .name = CURL_BLOCK_OPT_PROXY_USERNAME, + .type = QEMU_OPT_STRING, + .help = "Username for HTTP proxy auth" + }, + { + .name = CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of secret used as password for HTTP proxy auth", + }, { /* end of list */ } }, }; + static int curl_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -539,6 +584,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, const char *file; const char *cookie; double d; + const char *secretid; static int inited = 0; @@ -580,6 +626,26 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, goto out_noclean; } + s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME)); + secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET); + + if (secretid) { + s->password = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!s->password) { + goto out_noclean; + } + } + + s->proxyusername = g_strdup( + qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_USERNAME)); + secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET); + if (secretid) { + s->proxypassword = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!s->proxypassword) { + goto out_noclean; + } + } + if (!inited) { curl_global_init(CURL_GLOBAL_ALL); inited = 1; From b189346eb1784df95ed6fed610411dbf23d19e1f Mon Sep 17 00:00:00 2001 From: "Daniel P. 
Berrange" Date: Thu, 21 Jan 2016 14:19:21 +0000 Subject: [PATCH 05/12] iscsi: add support for getting CHAP password via QCryptoSecret API The iSCSI driver currently accepts the CHAP password in plain text as a block driver property. This change adds a new "password-secret" property that accepts the ID of a QCryptoSecret instance. $QEMU \ -object secret,id=sec0,filename=/home/berrange/example.pw \ -drive driver=iscsi,url=iscsi://example.com/target-foo/lun1,\ user=dan,password-secret=sec0 Signed-off-by: Daniel P. Berrange Message-id: 1453385961-10718-4-git-send-email-berrange@redhat.com Signed-off-by: Jeff Cody --- block/iscsi.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index 9fe76f48ec..128ea79c13 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -39,6 +39,7 @@ #include "sysemu/sysemu.h" #include "qmp-commands.h" #include "qapi/qmp/qstring.h" +#include "crypto/secret.h" #include #include @@ -1080,6 +1081,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, QemuOpts *opts; const char *user = NULL; const char *password = NULL; + const char *secretid; + char *secret = NULL; list = qemu_find_opts("iscsi"); if (!list) { @@ -1099,8 +1102,20 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, return; } + secretid = qemu_opt_get(opts, "password-secret"); password = qemu_opt_get(opts, "password"); - if (!password) { + if (secretid && password) { + error_setg(errp, "'password' and 'password-secret' properties are " + "mutually exclusive"); + return; + } + if (secretid) { + secret = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!secret) { + return; + } + password = secret; + } else if (!password) { error_setg(errp, "CHAP username specified but no password was given"); return; } @@ -1108,6 +1123,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { 
error_setg(errp, "Failed to set initiator username and password"); } + + g_free(secret); } static void parse_header_digest(struct iscsi_context *iscsi, const char *target, @@ -1857,6 +1874,11 @@ static QemuOptsList qemu_iscsi_opts = { .name = "password", .type = QEMU_OPT_STRING, .help = "password for CHAP authentication to target", + },{ + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of the secret providing password for CHAP " + "authentication to target", },{ .name = "header-digest", .type = QEMU_OPT_STRING, From 939901dcd2093a5d6b40420314d1af8b185ec43c Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 18 Dec 2015 16:04:25 +0100 Subject: [PATCH 06/12] vhdx: DIV_ROUND_UP() in vhdx_calc_bat_entries() We have DIV_ROUND_UP(), so we can use it to produce more easily readable code. It may be slower than the bit shifting currently performed (because it actually performs a division), but since vhdx_calc_bat_entries() is never used in a hot path, this is completely fine. 
Signed-off-by: Max Reitz Message-id: 1450451066-13335-2-git-send-email-mreitz@redhat.com Signed-off-by: Jeff Cody --- block/vhdx.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/block/vhdx.c b/block/vhdx.c index 72042e9082..1e7e03ebed 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -857,14 +857,8 @@ static void vhdx_calc_bat_entries(BDRVVHDXState *s) { uint32_t data_blocks_cnt, bitmap_blocks_cnt; - data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; - if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { - data_blocks_cnt++; - } - bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; - if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { - bitmap_blocks_cnt++; - } + data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size); + bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio); if (s->parent_entries) { s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); From 04a3615860d693cbf5da015ee136a313628a42be Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 18 Dec 2015 16:04:26 +0100 Subject: [PATCH 07/12] vhdx: Simplify vhdx_set_shift_bits() For values which are powers of two (and we do assume all of these to be), sizeof(x) * 8 - 1 - clz(x) == ctz(x). Therefore, use ctz(). 
Signed-off-by: Max Reitz Message-id: 1450451066-13335-3-git-send-email-mreitz@redhat.com Signed-off-by: Jeff Cody --- block/vhdx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/vhdx.c b/block/vhdx.c index 1e7e03ebed..9a51428317 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -264,10 +264,10 @@ static void vhdx_region_unregister_all(BDRVVHDXState *s) static void vhdx_set_shift_bits(BDRVVHDXState *s) { - s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); - s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); - s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); - s->block_size_bits = 31 - clz32(s->block_size); + s->logical_sector_size_bits = ctz32(s->logical_sector_size); + s->sectors_per_block_bits = ctz32(s->sectors_per_block); + s->chunk_ratio_bits = ctz64(s->chunk_ratio); + s->block_size_bits = ctz32(s->block_size); } /* From e5b43573e28b226621ac6ed9ad71e1a72d71922d Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 5 Feb 2016 10:00:29 +0800 Subject: [PATCH 08/12] mirror: Rewrite mirror_iteration The "pnum < nb_sectors" condition in deciding whether to actually copy data is unnecessarily strict, and the qiov initialization is unnecessary for bdrv_aio_write_zeroes and bdrv_aio_discard. Rewrite mirror_iteration to fix both flaws. The output of iotests 109 is updated because we now report the offset and len slightly differently in mirroring progress. 
Signed-off-by: Fam Zheng Reviewed-by: Max Reitz Message-id: 1454637630-10585-2-git-send-email-famz@redhat.com Signed-off-by: Jeff Cody --- block/mirror.c | 335 ++++++++++++++++++++++--------------- tests/qemu-iotests/109.out | 80 ++++----- trace-events | 1 - 3 files changed, 243 insertions(+), 173 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 2c0edfaf48..48cd0b319b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -47,7 +47,6 @@ typedef struct MirrorBlockJob { BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; - int64_t sector_num; int64_t granularity; size_t buf_size; int64_t bdev_length; @@ -64,6 +63,8 @@ typedef struct MirrorBlockJob { int ret; bool unmap; bool waiting_for_io; + int target_cluster_sectors; + int max_iov; } MirrorBlockJob; typedef struct MirrorOp { @@ -159,116 +160,79 @@ static void mirror_read_complete(void *opaque, int ret) mirror_write_complete, op); } -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and + * return the offset of the adjusted tail sector against original. 
*/ +static int mirror_cow_align(MirrorBlockJob *s, + int64_t *sector_num, + int *nb_sectors) { - BlockDriverState *source = s->common.bs; - int nb_sectors, sectors_per_chunk, nb_chunks, max_iov; - int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; - uint64_t delay_ns = 0; - MirrorOp *op; - int pnum; - int64_t ret; - BlockDriverState *file; + bool need_cow; + int ret = 0; + int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS; + int64_t align_sector_num = *sector_num; + int align_nb_sectors = *nb_sectors; + int max_sectors = chunk_sectors * s->max_iov; - max_iov = MIN(source->bl.max_iov, s->target->bl.max_iov); - - s->sector_num = hbitmap_iter_next(&s->hbi); - if (s->sector_num < 0) { - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); - s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); - assert(s->sector_num >= 0); + need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap); + need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, + s->cow_bitmap); + if (need_cow) { + bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, + &align_sector_num, &align_nb_sectors); } - hbitmap_next_sector = s->sector_num; - sector_num = s->sector_num; + if (align_nb_sectors > max_sectors) { + align_nb_sectors = max_sectors; + if (need_cow) { + align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors, + s->target_cluster_sectors); + } + } + + ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors); + *sector_num = align_sector_num; + *nb_sectors = align_nb_sectors; + assert(ret >= 0); + return ret; +} + +/* Submit async read while handling COW. + * Returns: nb_sectors if no alignment is necessary, or + * (new_end - sector_num) if tail is rounded up or down due to + * alignment or buffer limit. 
+ */ +static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, + int nb_sectors) +{ + BlockDriverState *source = s->common.bs; + int sectors_per_chunk, nb_chunks; + int ret = nb_sectors; + MirrorOp *op; + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - end = s->bdev_length / BDRV_SECTOR_SIZE; - /* Extend the QEMUIOVector to include all adjacent blocks that will - * be copied in this operation. - * - * We have to do this if we have no backing file yet in the destination, - * and the cluster size is very large. Then we need to do COW ourselves. - * The first time a cluster is copied, copy it entirely. Note that, - * because both the granularity and the cluster size are powers of two, - * the number of sectors to copy cannot exceed one cluster. - * - * We also want to extend the QEMUIOVector to include more adjacent - * dirty blocks if possible, to limit the number of I/O operations and - * run efficiently even with a small granularity. - */ - nb_chunks = 0; - nb_sectors = 0; - next_sector = sector_num; - next_chunk = sector_num / sectors_per_chunk; + /* We can only handle as much as buf_size at a time. */ + nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors); + assert(nb_sectors); - /* Wait for I/O to this cluster (from a previous iteration) to be done. */ - while (test_bit(next_chunk, s->in_flight_bitmap)) { + if (s->cow_bitmap) { + ret += mirror_cow_align(s, §or_num, &nb_sectors); + } + assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size); + /* The sector range must meet granularity because: + * 1) Caller passes in aligned values; + * 2) mirror_cow_align is used only when target cluster is larger. 
*/ + assert(!(nb_sectors % sectors_per_chunk)); + assert(!(sector_num % sectors_per_chunk)); + nb_chunks = nb_sectors / sectors_per_chunk; + + while (s->buf_free_count < nb_chunks) { trace_mirror_yield_in_flight(s, sector_num, s->in_flight); s->waiting_for_io = true; qemu_coroutine_yield(); s->waiting_for_io = false; } - do { - int added_sectors, added_chunks; - - if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || - test_bit(next_chunk, s->in_flight_bitmap)) { - assert(nb_sectors > 0); - break; - } - - added_sectors = sectors_per_chunk; - if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { - bdrv_round_to_clusters(s->target, - next_sector, added_sectors, - &next_sector, &added_sectors); - - /* On the first iteration, the rounding may make us copy - * sectors before the first dirty one. - */ - if (next_sector < sector_num) { - assert(nb_sectors == 0); - sector_num = next_sector; - next_chunk = next_sector / sectors_per_chunk; - } - } - - added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); - added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; - - /* When doing COW, it may happen that there is not enough space for - * a full cluster. Wait if that is the case. - */ - while (nb_chunks == 0 && s->buf_free_count < added_chunks) { - trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; - } - if (s->buf_free_count < nb_chunks + added_chunks) { - trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); - break; - } - if (max_iov < nb_chunks + added_chunks) { - trace_mirror_break_iov_max(s, nb_chunks, added_chunks); - break; - } - - /* We have enough free space to copy these sectors. 
*/ - bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); - - nb_sectors += added_sectors; - nb_chunks += added_chunks; - next_sector += added_sectors; - next_chunk += added_chunks; - if (!s->synced && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); - } - } while (delay_ns == 0 && next_sector < end); - /* Allocate a MirrorOp that is used as an AIO callback. */ op = g_new(MirrorOp, 1); op->s = s; @@ -279,47 +243,153 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) * from s->buf_free. */ qemu_iovec_init(&op->qiov, nb_chunks); - next_sector = sector_num; while (nb_chunks-- > 0) { MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); - size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size; QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); s->buf_free_count--; qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); - - /* Advance the HBitmapIter in parallel, so that we do not examine - * the same sector twice. - */ - if (next_sector > hbitmap_next_sector - && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { - hbitmap_next_sector = hbitmap_iter_next(&s->hbi); - } - - next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); - /* Copy the dirty cluster. 
*/ s->in_flight++; s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); - ret = bdrv_get_block_status_above(source, NULL, sector_num, - nb_sectors, &pnum, &file); - if (ret < 0 || pnum < nb_sectors || - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, - mirror_read_complete, op); - } else if (ret & BDRV_BLOCK_ZERO) { + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + mirror_read_complete, op); + return ret; +} + +static void mirror_do_zero_or_discard(MirrorBlockJob *s, + int64_t sector_num, + int nb_sectors, + bool is_discard) +{ + MirrorOp *op; + + /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed + * so the freeing in mirror_iteration_done is nop. */ + op = g_new0(MirrorOp, 1); + op->s = s; + op->sector_num = sector_num; + op->nb_sectors = nb_sectors; + + s->in_flight++; + s->sectors_in_flight += nb_sectors; + if (is_discard) { + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, + mirror_write_complete, op); + } else { bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, mirror_write_complete, op); - } else { - assert(!(ret & BDRV_BLOCK_DATA)); - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, - mirror_write_complete, op); + } +} + +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ + BlockDriverState *source = s->common.bs; + int64_t sector_num; + uint64_t delay_ns = 0; + /* At least the first dirty chunk is mirrored in one iteration. 
*/ + int nb_chunks = 1; + int64_t end = s->bdev_length / BDRV_SECTOR_SIZE; + int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + + sector_num = hbitmap_iter_next(&s->hbi); + if (sector_num < 0) { + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); + sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); + assert(sector_num >= 0); + } + + /* Find the number of consective dirty chunks following the first dirty + * one, and wait for in flight requests in them. */ + while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { + int64_t hbitmap_next; + int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; + int64_t next_chunk = next_sector / sectors_per_chunk; + if (next_sector >= end || + !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + break; + } + if (test_bit(next_chunk, s->in_flight_bitmap)) { + if (nb_chunks > 0) { + break; + } + trace_mirror_yield_in_flight(s, next_sector, s->in_flight); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; + /* Now retry. */ + } else { + hbitmap_next = hbitmap_iter_next(&s->hbi); + assert(hbitmap_next == next_sector); + nb_chunks++; + } + } + + /* Clear dirty bits before querying the block status, because + * calling bdrv_get_block_status_above could yield - if some blocks are + * marked dirty in this window, we need to know. 
+ */ + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, + nb_chunks * sectors_per_chunk); + bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); + while (nb_chunks > 0 && sector_num < end) { + int ret; + int io_sectors; + BlockDriverState *file; + enum MirrorMethod { + MIRROR_METHOD_COPY, + MIRROR_METHOD_ZERO, + MIRROR_METHOD_DISCARD + } mirror_method = MIRROR_METHOD_COPY; + + assert(!(sector_num % sectors_per_chunk)); + ret = bdrv_get_block_status_above(source, NULL, sector_num, + nb_chunks * sectors_per_chunk, + &io_sectors, &file); + if (ret < 0) { + io_sectors = nb_chunks * sectors_per_chunk; + } + + io_sectors -= io_sectors % sectors_per_chunk; + if (io_sectors < sectors_per_chunk) { + io_sectors = sectors_per_chunk; + } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { + int64_t target_sector_num; + int target_nb_sectors; + bdrv_round_to_clusters(s->target, sector_num, io_sectors, + &target_sector_num, &target_nb_sectors); + if (target_sector_num == sector_num && + target_nb_sectors == io_sectors) { + mirror_method = ret & BDRV_BLOCK_ZERO ? 
+ MIRROR_METHOD_ZERO : + MIRROR_METHOD_DISCARD; + } + } + + switch (mirror_method) { + case MIRROR_METHOD_COPY: + io_sectors = mirror_do_read(s, sector_num, io_sectors); + break; + case MIRROR_METHOD_ZERO: + mirror_do_zero_or_discard(s, sector_num, io_sectors, false); + break; + case MIRROR_METHOD_DISCARD: + mirror_do_zero_or_discard(s, sector_num, io_sectors, true); + break; + default: + abort(); + } + assert(io_sectors); + sector_num += io_sectors; + nb_chunks -= io_sectors / sectors_per_chunk; + delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors); } return delay_ns; } @@ -420,6 +490,7 @@ static void coroutine_fn mirror_run(void *opaque) checking for a NULL string */ int ret = 0; int n; + int target_cluster_size = BDRV_SECTOR_SIZE; if (block_job_is_cancelled(&s->common)) { goto immediate_exit; @@ -449,16 +520,16 @@ static void coroutine_fn mirror_run(void *opaque) */ bdrv_get_backing_filename(s->target, backing_filename, sizeof(backing_filename)); - if (backing_filename[0] && !s->target->backing) { - ret = bdrv_get_info(s->target, &bdi); - if (ret < 0) { - goto immediate_exit; - } - if (s->granularity < bdi.cluster_size) { - s->buf_size = MAX(s->buf_size, bdi.cluster_size); - s->cow_bitmap = bitmap_new(length); - } + if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { + target_cluster_size = bdi.cluster_size; } + if (backing_filename[0] && !s->target->backing + && s->granularity < target_cluster_size) { + s->buf_size = MAX(s->buf_size, target_cluster_size); + s->cow_bitmap = bitmap_new(length); + } + s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; + s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); diff --git a/tests/qemu-iotests/109.out b/tests/qemu-iotests/109.out index 7db92c9ce8..b3358de73f 100644 --- a/tests/qemu-iotests/109.out +++ b/tests/qemu-iotests/109.out @@ -2,57 +2,57 @@ QA output created by 109 === Writing a 
qcow header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 1024, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 1024, "offset": 1024, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Warning: Image size mismatch! 
Images are identical. === Writing a qcow2 header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 197120, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 262144, "offset": 65536, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 197120, "offset": 197120, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 197120, "offset": 197120, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 262144, "offset": 262144, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 262144, "offset": 262144, "paused": false, "speed": 0, "ready": true, 
"type": "mirror"}]} Warning: Image size mismatch! Images are identical. === Writing a qed header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 262144, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) @@ -65,29 +65,29 @@ Images are identical. === Writing a vdi header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. 
{"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 1024, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 1024, "offset": 1024, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Warning: Image size mismatch! Images are identical. === Writing a vmdk header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. 
@@ -107,49 +107,49 @@ Images are identical. === Writing a vpc header into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 -Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 2560, "offset": 2560, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": 
true, "type": "mirror"}]} Warning: Image size mismatch! Images are identical. === Copying sample image empty.bochs into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 2560, "offset": 2560, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Image resized. Warning: Image size mismatch! 
Images are identical. === Copying sample image iotest-dirtylog-10G-4M.vhdx into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. @@ -170,7 +170,7 @@ Images are identical. === Copying sample image parallels-v1 into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. @@ -191,41 +191,41 @@ Images are identical. === Copying sample image simple-pattern.cloop into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. 
{"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_ERROR", "data": {"device": "src", "operation": "write", "action": "report"}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2048, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 0, "speed": 0, "type": "mirror", "error": "Operation not permitted"}} {"return": []} read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2048, "offset": 2048, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 2048, "offset": 2048, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Image resized. Warning: Image size mismatch! Images are identical. === Write legitimate MBR into raw === -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed raw. Automatically detecting the format is dangerous for raw images, write operations on block 0 will be restricted. Specify the 'raw' format explicitly to remove the restrictions. 
{"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 512, "offset": 512, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Warning: Image size mismatch! Images are identical. {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} -{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 512, "offset": 512, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} +{"return": [{"io-status": "ok", "device": "src", "busy": false, "len": 65536, "offset": 65536, "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} Warning: Image size mismatch! Images are identical. 
*** done diff --git a/trace-events b/trace-events index 075ec27100..deb4d808bd 100644 --- a/trace-events +++ b/trace-events @@ -97,7 +97,6 @@ mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirt mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d" mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d" -mirror_break_iov_max(void *s, int nb_chunks, int added_chunks) "s %p requested chunks %d added_chunks %d" # block/backup.c backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d" From 21cd917ff5ff30dbba49dd805da033f8c92fa958 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 5 Feb 2016 10:00:30 +0800 Subject: [PATCH 09/12] mirror: Add mirror_wait_for_io The three lines are duplicated a number of times now, refactor a function. Signed-off-by: Fam Zheng Reviewed-by: Max Reitz Message-id: 1454637630-10585-3-git-send-email-famz@redhat.com Signed-off-by: Jeff Cody --- block/mirror.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 48cd0b319b..9635fa8e62 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -196,6 +196,14 @@ static int mirror_cow_align(MirrorBlockJob *s, return ret; } +static inline void mirror_wait_for_io(MirrorBlockJob *s) +{ + assert(!s->waiting_for_io); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; +} + /* Submit async read while handling COW. 
* Returns: nb_sectors if no alignment is necessary, or * (new_end - sector_num) if tail is rounded up or down due to @@ -228,9 +236,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, while (s->buf_free_count < nb_chunks) { trace_mirror_yield_in_flight(s, sector_num, s->in_flight); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; + mirror_wait_for_io(s); } /* Allocate a MirrorOp that is used as an AIO callback. */ @@ -321,9 +327,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) break; } trace_mirror_yield_in_flight(s, next_sector, s->in_flight); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; + mirror_wait_for_io(s); /* Now retry. */ } else { hbitmap_next = hbitmap_iter_next(&s->hbi); @@ -414,9 +418,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_drain(MirrorBlockJob *s) { while (s->in_flight > 0) { - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; + mirror_wait_for_io(s); } } @@ -604,9 +606,7 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; + mirror_wait_for_io(s); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); From 16096a4d4789ef1f260484b4cc18b8327cf0c928 Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 25 Feb 2016 15:58:29 -0500 Subject: [PATCH 10/12] block/backup: make backup cluster size configurable 64K might not always be appropriate, make this a runtime value. 
Signed-off-by: John Snow Reviewed-by: Fam Zheng Message-id: 1456433911-24718-2-git-send-email-jsnow@redhat.com Signed-off-by: Jeff Cody --- block/backup.c | 64 ++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/block/backup.c b/block/backup.c index 00cafdbe2b..76addef3dc 100644 --- a/block/backup.c +++ b/block/backup.c @@ -21,10 +21,7 @@ #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" -#define BACKUP_CLUSTER_BITS 16 -#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) -#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) - +#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) #define SLICE_TIME 100000000ULL /* ns */ typedef struct CowRequest { @@ -46,9 +43,16 @@ typedef struct BackupBlockJob { CoRwlock flush_rwlock; uint64_t sectors_read; HBitmap *bitmap; + int64_t cluster_size; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; +/* Size of a cluster in sectors, instead of bytes. */ +static inline int64_t cluster_size_sectors(BackupBlockJob *job) +{ + return job->cluster_size / BDRV_SECTOR_SIZE; +} + /* See if in-flight requests overlap and wait for them to complete */ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, int64_t start, @@ -97,13 +101,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, QEMUIOVector bounce_qiov; void *bounce_buffer = NULL; int ret = 0; + int64_t sectors_per_cluster = cluster_size_sectors(job); int64_t start, end; int n; qemu_co_rwlock_rdlock(&job->flush_rwlock); - start = sector_num / BACKUP_SECTORS_PER_CLUSTER; - end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER); + start = sector_num / sectors_per_cluster; + end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster); trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); @@ -118,12 +123,12 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, trace_backup_do_cow_process(job, start); - n = 
MIN(BACKUP_SECTORS_PER_CLUSTER, + n = MIN(sectors_per_cluster, job->common.len / BDRV_SECTOR_SIZE - - start * BACKUP_SECTORS_PER_CLUSTER); + start * sectors_per_cluster); if (!bounce_buffer) { - bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE); + bounce_buffer = qemu_blockalign(bs, job->cluster_size); } iov.iov_base = bounce_buffer; iov.iov_len = n * BDRV_SECTOR_SIZE; @@ -131,10 +136,10 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (is_write_notifier) { ret = bdrv_co_readv_no_serialising(bs, - start * BACKUP_SECTORS_PER_CLUSTER, + start * sectors_per_cluster, n, &bounce_qiov); } else { - ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n, + ret = bdrv_co_readv(bs, start * sectors_per_cluster, n, &bounce_qiov); } if (ret < 0) { @@ -147,11 +152,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, + start * sectors_per_cluster, n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, n, + start * sectors_per_cluster, n, &bounce_qiov); } if (ret < 0) { @@ -322,21 +327,22 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) int64_t cluster; int64_t end; int64_t last_cluster = -1; + int64_t sectors_per_cluster = cluster_size_sectors(job); BlockDriverState *bs = job->common.bs; HBitmapIter hbi; granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); - clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + clusters_per_iter = MAX((granularity / job->cluster_size), 1); bdrv_dirty_iter_init(job->sync_bitmap, &hbi); /* Find the next dirty sector(s) */ while ((sector = hbitmap_iter_next(&hbi)) != -1) { - cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + cluster = sector / sectors_per_cluster; /* Fake progress updates for any clusters we skipped */ if (cluster != last_cluster + 1) { job->common.offset += ((cluster - 
last_cluster - 1) * - BACKUP_CLUSTER_SIZE); + job->cluster_size); } for (end = cluster + clusters_per_iter; cluster < end; cluster++) { @@ -344,8 +350,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) if (yield_and_check(job)) { return ret; } - ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, - BACKUP_SECTORS_PER_CLUSTER, &error_is_read, + ret = backup_do_cow(bs, cluster * sectors_per_cluster, + sectors_per_cluster, &error_is_read, false); if ((ret < 0) && backup_error_action(job, error_is_read, -ret) == @@ -357,17 +363,17 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) /* If the bitmap granularity is smaller than the backup granularity, * we need to advance the iterator pointer to the next cluster. */ - if (granularity < BACKUP_CLUSTER_SIZE) { - bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + if (granularity < job->cluster_size) { + bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster); } last_cluster = cluster - 1; } /* Play some final catchup with the progress meter */ - end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + end = DIV_ROUND_UP(job->common.len, job->cluster_size); if (last_cluster + 1 < end) { - job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + job->common.offset += ((end - last_cluster - 1) * job->cluster_size); } return ret; @@ -384,13 +390,14 @@ static void coroutine_fn backup_run(void *opaque) .notify = backup_before_write_notify, }; int64_t start, end; + int64_t sectors_per_cluster = cluster_size_sectors(job); int ret = 0; QLIST_INIT(&job->inflight_reqs); qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + end = DIV_ROUND_UP(job->common.len, job->cluster_size); job->bitmap = hbitmap_alloc(end, 0); @@ -427,7 +434,7 @@ static void coroutine_fn backup_run(void *opaque) /* Check to see if these blocks are already in the * backing file. 
*/ - for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { + for (i = 0; i < sectors_per_cluster;) { /* bdrv_is_allocated() only returns true/false based * on the first set of sectors it comes across that * are are all in the same state. @@ -436,8 +443,8 @@ static void coroutine_fn backup_run(void *opaque) * needed but at some point that is always the case. */ alloced = bdrv_is_allocated(bs, - start * BACKUP_SECTORS_PER_CLUSTER + i, - BACKUP_SECTORS_PER_CLUSTER - i, &n); + start * sectors_per_cluster + i, + sectors_per_cluster - i, &n); i += n; if (alloced == 1 || n == 0) { @@ -452,8 +459,8 @@ static void coroutine_fn backup_run(void *opaque) } } /* FULL sync mode we copy the whole drive. */ - ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER, - BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false); + ret = backup_do_cow(bs, start * sectors_per_cluster, + sectors_per_cluster, &error_is_read, false); if (ret < 0) { /* Depending on error action, fail now or retry cluster */ BlockErrorAction action = @@ -571,6 +578,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->sync_mode = sync_mode; job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? sync_bitmap : NULL; + job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); block_job_txn_add_job(txn, &job->common); From 4c9bca7e39a6e07ad02c1dcde3478363344ec60b Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 25 Feb 2016 15:58:30 -0500 Subject: [PATCH 11/12] block/backup: avoid copying less than full target clusters During incremental backups, if the target has a cluster size that is larger than the backup cluster size and we are backing up to a target that cannot (for whichever reason) pull clusters up from a backing image, we may inadvertently create unusable incremental backup images.
For example: If the bitmap tracks changes at a 64KB granularity and we transmit 64KB of data at a time but the target uses a 128KB cluster size, it is possible that only half of a target cluster will be recognized as dirty by the backup block job. When the cluster is allocated on the target image but only half populated with data, we lose the ability to distinguish between zero padding and uninitialized data. This does not happen if the target image has a backing file that points to the last known good backup. Even if we have a backing file, though, it's likely going to be faster to just buffer the redundant data ourselves from the live image than fetching it from the backing file, so let's just always round up to the target granularity. The same logic applies to backup modes top, none, and full. Copying fractional clusters without the guarantee of COW is dangerous, but even if we can rely on COW, it's likely better to just re-copy the data. Reported-by: Fam Zheng Signed-off-by: John Snow Reviewed-by: Fam Zheng Message-id: 1456433911-24718-3-git-send-email-jsnow@redhat.com Signed-off-by: Jeff Cody --- block/backup.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/block/backup.c b/block/backup.c index 76addef3dc..0f1b1bc084 100644 --- a/block/backup.c +++ b/block/backup.c @@ -501,6 +501,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockJobTxn *txn, Error **errp) { int64_t len; + BlockDriverInfo bdi; + int ret; assert(bs); assert(target); @@ -570,15 +572,32 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, goto error; } - bdrv_op_block_all(target, job->common.blocker); - job->on_source_error = on_source_error; job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? 
sync_bitmap : NULL; - job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; + + /* If there is no backing file on the target, we cannot rely on COW if our + * backup cluster size is smaller than the target cluster size. Even for + * targets with a backing file, try to avoid COW if possible. */ + ret = bdrv_get_info(job->target, &bdi); + if (ret < 0 && !target->backing) { + error_setg_errno(errp, -ret, + "Couldn't determine the cluster size of the target image, " + "which has no backing file"); + error_append_hint(errp, + "Aborting, since this may create an unusable destination image\n"); + goto error; + } else if (ret < 0 && target->backing) { + /* Not fatal; just trudge on ahead. */ + job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; + } else { + job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); + } + + bdrv_op_block_all(target, job->common.blocker); job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); block_job_txn_add_job(txn, &job->common); From cc199b16cf4cb9279aca73f5f5dce2cc337b9079 Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 25 Feb 2016 15:58:31 -0500 Subject: [PATCH 12/12] iotests/124: Add cluster_size mismatch test If a backing file isn't specified in the target image and the cluster_size is larger than the bitmap granularity, we run the risk of creating bitmaps with allocated clusters but empty/no data which will prevent the proper reading of the backup in the future. 
Signed-off-by: John Snow Reviewed-by: Fam Zheng Message-id: 1456433911-24718-4-git-send-email-jsnow@redhat.com Signed-off-by: Jeff Cody --- tests/qemu-iotests/124 | 58 ++++++++++++++++++++++++++++++++++---- tests/qemu-iotests/124.out | 4 +-- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 index 7d334225b5..de7cdbe00e 100644 --- a/tests/qemu-iotests/124 +++ b/tests/qemu-iotests/124 @@ -132,14 +132,16 @@ class TestIncrementalBackupBase(iotests.QMPTestCase): def img_create(self, img, fmt=iotests.imgfmt, size='64M', - parent=None, parentFormat=None): + parent=None, parentFormat=None, **kwargs): + optargs = [] + for k,v in kwargs.iteritems(): + optargs = optargs + ['-o', '%s=%s' % (k,v)] + args = ['create', '-f', fmt] + optargs + [img, size] if parent: if parentFormat is None: parentFormat = fmt - iotests.qemu_img('create', '-f', fmt, img, size, - '-b', parent, '-F', parentFormat) - else: - iotests.qemu_img('create', '-f', fmt, img, size) + args = args + ['-b', parent, '-F', parentFormat] + iotests.qemu_img(*args) self.files.append(img) @@ -307,6 +309,52 @@ class TestIncrementalBackup(TestIncrementalBackupBase): return self.do_incremental_simple(granularity=131072) + def test_larger_cluster_target(self): + ''' + Test: Create and verify backups made to a larger cluster size target. + + With a default granularity of 64KiB, verify that backups made to a + larger cluster size target of 128KiB without a backing file works. + ''' + drive0 = self.drives[0] + + # Create a cluster_size=128k full backup / "anchor" backup + self.img_create(drive0['backup'], cluster_size='128k') + self.assertTrue(self.do_qmp_backup(device=drive0['id'], sync='full', + format=drive0['fmt'], + target=drive0['backup'], + mode='existing')) + + # Create bitmap and dirty it with some new writes. + # overwrite [32736, 32799] which will dirty bitmap clusters at + # 32M-64K and 32M. 32M+64K will be left undirtied. 
+ bitmap0 = self.add_bitmap('bitmap0', drive0) + self.hmp_io_writes(drive0['id'], + (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + + + # Prepare a cluster_size=128k backup target without a backing file. + (target, _) = bitmap0.new_target() + self.img_create(target, bitmap0.drive['fmt'], cluster_size='128k') + + # Perform Incremental Backup + self.assertTrue(self.do_qmp_backup(device=bitmap0.drive['id'], + sync='incremental', + bitmap=bitmap0.name, + format=bitmap0.drive['fmt'], + target=target, + mode='existing')) + self.make_reference_backup(bitmap0) + + # Add the backing file, then compare and exit. + iotests.qemu_img('rebase', '-f', drive0['fmt'], '-u', '-b', + drive0['backup'], '-F', drive0['fmt'], target) + self.vm.shutdown() + self.check_backups() + + def test_incremental_transaction(self): '''Test: Verify backups made from transactionally created bitmaps. diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out index dae404e278..36376bed87 100644 --- a/tests/qemu-iotests/124.out +++ b/tests/qemu-iotests/124.out @@ -1,5 +1,5 @@ -......... +.......... ---------------------------------------------------------------------- -Ran 9 tests +Ran 10 tests OK