From d24f80234b39d2d5c0d91e63b5e4569d37b2399e Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 9 May 2019 16:59:27 +0200 Subject: [PATCH 1/4] block/rbd: increase dynamically the image size RBD APIs don't allow us to write more than the size set with rbd_create() or rbd_resize(). In order to support growing images (eg. qcow2), we resize the image before write operations that exceed the current size. Signed-off-by: Stefano Garzarella Message-id: 20190509145927.293369-1-sgarzare@redhat.com Signed-off-by: Max Reitz --- block/rbd.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index f2ac2c06f4..59757b3120 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -103,6 +103,7 @@ typedef struct BDRVRBDState { rbd_image_t image; char *image_name; char *snap; + uint64_t image_size; } BDRVRBDState; static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, @@ -778,6 +779,14 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, goto failed_open; } + r = rbd_get_size(s->image, &s->image_size); + if (r < 0) { + error_setg_errno(errp, -r, "error getting image size from %s", + s->image_name); + rbd_close(s->image); + goto failed_open; + } + /* If we are using an rbd snapshot, we must be r/o, otherwise * leave as-is */ if (s->snap != NULL) { @@ -834,6 +843,22 @@ static void qemu_rbd_close(BlockDriverState *bs) rados_shutdown(s->cluster); } +/* Resize the RBD image and update the 'image_size' with the current size */ +static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_resize(s->image, size); + if (r < 0) { + return r; + } + + s->image_size = size; + + return 0; +} + static const AIOCBInfo rbd_aiocb_info = { .aiocb_size = sizeof(RBDAIOCB), }; @@ -935,13 +960,25 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, } switch (cmd) { - case RBD_AIO_WRITE: + case RBD_AIO_WRITE: { + /* + * RBD APIs don't allow us to write more than actual size, so in order + * to support growing images, we resize the image before write + * operations that exceed the current size. + */ + if (off + size > s->image_size) { + r = qemu_rbd_resize(bs, off + size); + if (r < 0) { + goto failed_completion; + } + } #ifdef LIBRBD_SUPPORTS_IOVEC r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c); #else r = rbd_aio_write(s->image, off, size, rcb->buf, c); #endif break; + } case RBD_AIO_READ: #ifdef LIBRBD_SUPPORTS_IOVEC r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c); @@ -1052,7 +1089,6 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, PreallocMode prealloc, Error **errp) { - BDRVRBDState *s = bs->opaque; int r; if (prealloc != PREALLOC_MODE_OFF) { @@ -1061,7 +1097,7 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, return -ENOTSUP; } - r = rbd_resize(s->image, offset); + r = qemu_rbd_resize(bs, offset); if (r < 0) { error_setg_errno(errp, -r, "Failed to resize file"); return r; From 170d3bd341b3955f10b40b4569f66bf3d4dbc4a0 Mon Sep 17 00:00:00 2001 From: Andrey Shinkevich Date: Wed, 29 May 2019 20:56:14 +0300 Subject: [PATCH 2/4] block: include base when checking image chain for block allocation This patch is used in the 'block/stream: introduce a bottom node' that is following. Instead of the base node, the caller may pass the node that has the base as its backing image to the function bdrv_is_allocated_above() with a new parameter include_base = true and get rid of the dependency on the base that may change during commit/stream parallel jobs. Now, if the specified base is not found in the backing image chain, the QEMU will abort. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-id: 1559152576-281803-2-git-send-email-andrey.shinkevich@virtuozzo.com [mreitz: Squashed in the following as a rebase on conflicting patches:] Message-id: e3cf99ae-62e9-8b6e-5a06-d3c8b9363b85@redhat.com Signed-off-by: Max Reitz --- block/commit.c | 2 +- block/io.c | 21 +++++++++++++++------ block/mirror.c | 2 +- block/qcow2.c | 3 ++- block/replication.c | 2 +- block/stream.c | 2 +- include/block/block.h | 3 ++- qemu-img.c | 2 +- 8 files changed, 24 insertions(+), 13 deletions(-) diff --git a/block/commit.c b/block/commit.c index 212c6f639e..ca7e408b26 100644 --- a/block/commit.c +++ b/block/commit.c @@ -174,7 +174,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), + ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false, offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); diff --git a/block/io.c b/block/io.c index 9ba1bada36..24a18759fd 100644 --- a/block/io.c +++ b/block/io.c @@ -2295,10 +2295,11 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, /* * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] * - * Return true if (a prefix of) the given range is allocated in any image - * between BASE and TOP (inclusive). BASE can be NULL to check if the given - * offset is allocated in any image of the chain. Return false otherwise, - * or negative errno on failure. + * Return 1 if (a prefix of) the given range is allocated in any image + * between BASE and TOP (BASE is only included if include_base is set). + * BASE can be NULL to check if the given offset is allocated in any + * image of the chain. Return 0 otherwise, or negative errno on + * failure. * * 'pnum' is set to the number of bytes (including and immediately * following the specified offset) that are known to be in the same @@ -2310,17 +2311,21 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, */ int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - int64_t offset, int64_t bytes, int64_t *pnum) + bool include_base, int64_t offset, + int64_t bytes, int64_t *pnum) { BlockDriverState *intermediate; int ret; int64_t n = bytes; + assert(base || !include_base); + intermediate = top; - while (intermediate && intermediate != base) { + while (include_base || intermediate != base) { int64_t pnum_inter; int64_t size_inter; + assert(intermediate); ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); if (ret < 0) { return ret; @@ -2339,6 +2344,10 @@ int bdrv_is_allocated_above(BlockDriverState *top, n = pnum_inter; } + if (intermediate == base) { + break; + } + intermediate = backing_bs(intermediate); } diff --git a/block/mirror.c b/block/mirror.c index d17be4cdbc..2fcec70e35 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -808,7 +808,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) return 0; } - ret = bdrv_is_allocated_above(bs, base, offset, bytes, &count); + ret = bdrv_is_allocated_above(bs, base, false, offset, bytes, &count); if (ret < 0) { return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index 9396d490d5..2a59eb27fe 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2148,7 +2148,8 @@ static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes) { int64_t nr; return !bytes || - (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes); + (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) && + nr == bytes); } static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m) diff --git a/block/replication.c b/block/replication.c index b41bc507c0..23b2993d74 100644 --- a/block/replication.c +++ b/block/replication.c @@ -275,7 +275,7 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, while (remaining_sectors > 0) { int64_t count; - ret = bdrv_is_allocated_above(top->bs, base->bs, + ret = bdrv_is_allocated_above(top->bs, base->bs, false, sector_num * BDRV_SECTOR_SIZE, remaining_sectors * BDRV_SECTOR_SIZE, &count); diff --git a/block/stream.c b/block/stream.c index 1a906fd860..97fddb2608 100644 --- a/block/stream.c +++ b/block/stream.c @@ -160,7 +160,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ - ret = bdrv_is_allocated_above(backing_bs(bs), base, + ret = bdrv_is_allocated_above(backing_bs(bs), base, false, offset, n, &n); /* Finish early if end of backing file has been reached */ diff --git a/include/block/block.h b/include/block/block.h index f9415ed740..734c9d2f76 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -449,7 +449,8 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum); int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - int64_t offset, int64_t bytes, int64_t *pnum); + bool include_base, int64_t offset, int64_t bytes, + int64_t *pnum); bool bdrv_is_read_only(BlockDriverState *bs); int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, diff --git a/qemu-img.c b/qemu-img.c index 158b3a505f..79983772de 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3518,7 +3518,7 @@ static int img_rebase(int argc, char **argv) * to take action */ ret = bdrv_is_allocated_above(backing_bs(bs), prefix_chain_bs, - offset, n, &n); + false, offset, n, &n); if (ret < 0) { error_report("error while reading image metadata: %s", strerror(-ret)); From 96a07d5bf447e616361acd4181a0fb24377483d9 Mon Sep 17 00:00:00 2001 From: Andrey Shinkevich Date: Wed, 29 May 2019 20:56:15 +0300 Subject: [PATCH 3/4] block/stream: refactor stream_run: drop goto The goto is unnecessary in the stream_run() since the common exit code was removed in the commit eb23654dbe43b549ea2a9ebff9d8e: "jobs: utilize job_exit shim". Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Alberto Garcia Reviewed-by: Max Reitz Message-id: 1559152576-281803-3-git-send-email-andrey.shinkevich@virtuozzo.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/stream.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/block/stream.c b/block/stream.c index 97fddb2608..65b13b27e0 100644 --- a/block/stream.c +++ b/block/stream.c @@ -120,13 +120,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) void *buf; if (!bs->backing) { - goto out; + return 0; } len = bdrv_getlength(bs); if (len < 0) { - ret = len; - goto out; + return len; } job_progress_set_remaining(&s->common.job, len); @@ -203,14 +202,10 @@ static int coroutine_fn stream_run(Job *job, Error **errp) bdrv_disable_copy_on_read(bs); } - /* Do not remove the backing file if an error was there but ignored. */ - ret = error; - qemu_vfree(buf); -out: - /* Modify backing chain and close BDSes in main loop */ - return ret; + /* Do not remove the backing file if an error was there but ignored. */ + return error; } static const BlockJobDriver stream_job_driver = { From c624b015bf14fe01f1e6452a36e63b3ea1ae4998 Mon Sep 17 00:00:00 2001 From: Andrey Shinkevich Date: Wed, 29 May 2019 20:56:16 +0300 Subject: [PATCH 4/4] block/stream: introduce a bottom node The bottom node is the intermediate block device that has the base as its backing image. It is used instead of the base node while a block stream job is running to avoid dependency on the base that may change due to the parallel jobs. The change may take place due to a filter node as well that is inserted between the base and the intermediate bottom node. It occurs when the base node is the top one for another commit or stream job. After the introduction of the bottom node, don't freeze its backing child, that's the base, anymore. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-id: 1559152576-281803-4-git-send-email-andrey.shinkevich@virtuozzo.com Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/stream.c | 43 ++++++++++++++++++++++-------------------- tests/qemu-iotests/245 | 4 ++-- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/block/stream.c b/block/stream.c index 65b13b27e0..cd5e2ba9b0 100644 --- a/block/stream.c +++ b/block/stream.c @@ -31,7 +31,7 @@ enum { typedef struct StreamBlockJob { BlockJob common; - BlockDriverState *base; + BlockDriverState *bottom; BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; @@ -54,7 +54,7 @@ static void stream_abort(Job *job) if (s->chain_frozen) { BlockJob *bjob = &s->common; - bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->base); + bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->bottom); } } @@ -63,11 +63,11 @@ static int stream_prepare(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; BlockDriverState *bs = blk_bs(bjob->blk); - BlockDriverState *base = s->base; + BlockDriverState *base = backing_bs(s->bottom); Error *local_err = NULL; int ret = 0; - bdrv_unfreeze_backing_chain(bs, base); + bdrv_unfreeze_backing_chain(bs, s->bottom); s->chain_frozen = false; if (bs->backing) { @@ -110,7 +110,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; BlockDriverState *bs = blk_bs(blk); - BlockDriverState *base = s->base; + bool enable_cor = !backing_bs(s->bottom); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -119,7 +119,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp) int64_t n = 0; /* bytes */ void *buf; - if (!bs->backing) { + if (bs == s->bottom) { + /* Nothing to stream */ return 0; } @@ -136,7 +137,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) * backing chain since the copy-on-read operation does not take base into * account. */ - if (!base) { + if (enable_cor) { bdrv_enable_copy_on_read(bs); } @@ -159,9 +160,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ - ret = bdrv_is_allocated_above(backing_bs(bs), base, false, + ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true, offset, n, &n); - /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { n = len - offset; @@ -198,7 +198,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } } - if (!base) { + if (enable_cor) { bdrv_disable_copy_on_read(bs); } @@ -230,8 +230,10 @@ void stream_start(const char *job_id, BlockDriverState *bs, StreamBlockJob *s; BlockDriverState *iter; bool bs_read_only; + int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; + BlockDriverState *bottom = bdrv_find_overlay(bs, base); - if (bdrv_freeze_backing_chain(bs, base, errp) < 0) { + if (bdrv_freeze_backing_chain(bs, bottom, errp) < 0) { return; } @@ -248,10 +250,8 @@ void stream_start(const char *job_id, BlockDriverState *bs, * already have our own plans. Also don't allow resize as the image size is * queried only at the job start and then cached. */ s = block_job_create(job_id, &stream_job_driver, NULL, bs, - BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED | - BLK_PERM_GRAPH_MOD, - BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED | - BLK_PERM_WRITE, + basic_flags | BLK_PERM_GRAPH_MOD, + basic_flags | BLK_PERM_WRITE, speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; @@ -259,15 +259,18 @@ void stream_start(const char *job_id, BlockDriverState *bs, /* Block all intermediate nodes between bs and base, because they will * disappear from the chain after this operation. The streaming job reads - * every block only once, assuming that it doesn't change, so block writes - * and resizes. */ + * every block only once, assuming that it doesn't change, so forbid writes + * and resizes. Reassign the base node pointer because the backing BS of the + * bottom node might change after the call to bdrv_reopen_set_read_only() + * due to parallel block jobs running. + */ + base = backing_bs(bottom); for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) { block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED, - &error_abort); + basic_flags, &error_abort); } - s->base = base; + s->bottom = bottom; s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index 349b94aace..bc1ceb9792 100644 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -866,9 +866,9 @@ class TestBlockdevReopen(iotests.QMPTestCase): auto_finalize = False) self.assert_qmp(result, 'return', {}) - # We can't remove hd2 while the stream job is ongoing + # We can remove hd2 while the stream job is ongoing opts['backing']['backing'] = None - self.reopen(opts, {}, "Cannot change 'backing' link from 'hd1' to 'hd2'") + self.reopen(opts, {}) # We can't remove hd1 while the stream job is ongoing opts['backing'] = None