diff --git a/block.c b/block.c index 8b9d457546..91a66d4f3e 100644 --- a/block.c +++ b/block.c @@ -4660,6 +4660,31 @@ static void bdrv_delete(BlockDriverState *bs) g_free(bs); } +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp) +{ + BlockDriverState *new_node_bs; + Error *local_err = NULL; + + new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp); + if (new_node_bs == NULL) { + error_prepend(errp, "Could not create node: "); + return NULL; + } + + bdrv_drained_begin(bs); + bdrv_replace_node(bs, new_node_bs, &local_err); + bdrv_drained_end(bs); + + if (local_err) { + bdrv_unref(new_node_bs); + error_propagate(errp, local_err); + return NULL; + } + + return new_node_bs; +} + /* * Run consistency checks on an image * diff --git a/block/backup-top.c b/block/backup-top.c index fe6883cc97..6e7e7bf340 100644 --- a/block/backup-top.c +++ b/block/backup-top.c @@ -61,7 +61,7 @@ static coroutine_fn int backup_top_cbw(BlockDriverState *bs, uint64_t offset, off = QEMU_ALIGN_DOWN(offset, s->cluster_size); end = QEMU_ALIGN_UP(offset + bytes, s->cluster_size); - return block_copy(s->bcs, off, end - off, NULL); + return block_copy(s->bcs, off, end - off, true); } static int coroutine_fn backup_top_co_pdiscard(BlockDriverState *bs, @@ -186,6 +186,7 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source, BlockDriverState *target, const char *filter_node_name, uint64_t cluster_size, + BackupPerf *perf, BdrvRequestFlags write_flags, BlockCopyState **bcs, Error **errp) @@ -244,7 +245,8 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source, state->cluster_size = cluster_size; state->bcs = block_copy_state_new(top->backing, state->target, - cluster_size, write_flags, &local_err); + cluster_size, perf->use_copy_range, + write_flags, &local_err); if (local_err) { error_prepend(&local_err, "Cannot create block-copy-state: "); goto fail; diff --git a/block/backup-top.h b/block/backup-top.h index 
e5cabfa197..b28b0031c4 100644 --- a/block/backup-top.h +++ b/block/backup-top.h @@ -33,6 +33,7 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source, BlockDriverState *target, const char *filter_node_name, uint64_t cluster_size, + BackupPerf *perf, BdrvRequestFlags write_flags, BlockCopyState **bcs, Error **errp); diff --git a/block/backup.c b/block/backup.c index 9afa0bf3b4..cc525d5544 100644 --- a/block/backup.c +++ b/block/backup.c @@ -22,7 +22,6 @@ #include "block/block-copy.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" -#include "qemu/ratelimit.h" #include "qemu/cutils.h" #include "sysemu/block-backend.h" #include "qemu/bitmap.h" @@ -44,40 +43,17 @@ typedef struct BackupBlockJob { BlockdevOnError on_source_error; BlockdevOnError on_target_error; uint64_t len; - uint64_t bytes_read; int64_t cluster_size; + BackupPerf perf; BlockCopyState *bcs; + + bool wait; + BlockCopyCallState *bg_bcs_call; } BackupBlockJob; static const BlockJobDriver backup_job_driver; -static void backup_progress_bytes_callback(int64_t bytes, void *opaque) -{ - BackupBlockJob *s = opaque; - - s->bytes_read += bytes; -} - -static int coroutine_fn backup_do_cow(BackupBlockJob *job, - int64_t offset, uint64_t bytes, - bool *error_is_read) -{ - int ret = 0; - int64_t start, end; /* bytes */ - - start = QEMU_ALIGN_DOWN(offset, job->cluster_size); - end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size); - - trace_backup_do_cow_enter(job, start, offset, bytes); - - ret = block_copy(job->bcs, start, end - start, error_is_read); - - trace_backup_do_cow_return(job, offset, bytes, ret); - - return ret; -} - static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) { BdrvDirtyBitmap *bm; @@ -157,53 +133,96 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job, } } -static bool coroutine_fn yield_and_check(BackupBlockJob *job) +static void coroutine_fn backup_block_copy_callback(void *opaque) { - uint64_t delay_ns; + BackupBlockJob *s = opaque; - if 
(job_is_cancelled(&job->common.job)) { - return true; + if (s->wait) { + s->wait = false; + aio_co_wake(s->common.job.co); + } else { + job_enter(&s->common.job); } - - /* - * We need to yield even for delay_ns = 0 so that bdrv_drain_all() can - * return. Without a yield, the VM would not reboot. - */ - delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read); - job->bytes_read = 0; - job_sleep_ns(&job->common.job, delay_ns); - - if (job_is_cancelled(&job->common.job)) { - return true; - } - - return false; } static int coroutine_fn backup_loop(BackupBlockJob *job) { - bool error_is_read; - int64_t offset; - BdrvDirtyBitmapIter *bdbi; + BlockCopyCallState *s = NULL; int ret = 0; + bool error_is_read; + BlockErrorAction act; - bdbi = bdrv_dirty_iter_new(block_copy_dirty_bitmap(job->bcs)); - while ((offset = bdrv_dirty_iter_next(bdbi)) != -1) { - do { - if (yield_and_check(job)) { - goto out; - } - ret = backup_do_cow(job, offset, job->cluster_size, &error_is_read); - if (ret < 0 && backup_error_action(job, error_is_read, -ret) == - BLOCK_ERROR_ACTION_REPORT) - { - goto out; - } - } while (ret < 0); + while (true) { /* retry loop */ + job->bg_bcs_call = s = block_copy_async(job->bcs, 0, + QEMU_ALIGN_UP(job->len, job->cluster_size), + job->perf.max_workers, job->perf.max_chunk, + backup_block_copy_callback, job); + + while (!block_copy_call_finished(s) && + !job_is_cancelled(&job->common.job)) + { + job_yield(&job->common.job); + } + + if (!block_copy_call_finished(s)) { + assert(job_is_cancelled(&job->common.job)); + /* + * Note that we can't use job_yield() here, as it doesn't work for + * cancelled job. + */ + block_copy_call_cancel(s); + job->wait = true; + qemu_coroutine_yield(); + assert(block_copy_call_finished(s)); + ret = 0; + goto out; + } + + if (job_is_cancelled(&job->common.job) || + block_copy_call_succeeded(s)) + { + ret = 0; + goto out; + } + + if (block_copy_call_cancelled(s)) { + /* + * Job is not cancelled but only block-copy call. 
This is possible + * after job pause. Now the pause is finished, start new block-copy + * iteration. + */ + block_copy_call_free(s); + continue; + } + + /* The only remaining case is failed block-copy call. */ + assert(block_copy_call_failed(s)); + + ret = block_copy_call_status(s, &error_is_read); + act = backup_error_action(job, error_is_read, -ret); + switch (act) { + case BLOCK_ERROR_ACTION_REPORT: + goto out; + case BLOCK_ERROR_ACTION_STOP: + /* + * Go to pause prior to starting new block-copy call on the next + * iteration. + */ + job_pause_point(&job->common.job); + break; + case BLOCK_ERROR_ACTION_IGNORE: + /* Proceed to new block-copy call to retry. */ + break; + default: + abort(); + } + + block_copy_call_free(s); } - out: - bdrv_dirty_iter_free(bdbi); +out: + block_copy_call_free(s); + job->bg_bcs_call = NULL; return ret; } @@ -235,7 +254,7 @@ static void backup_init_bcs_bitmap(BackupBlockJob *job) static int coroutine_fn backup_run(Job *job, Error **errp) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); - int ret = 0; + int ret; backup_init_bcs_bitmap(s); @@ -244,14 +263,19 @@ static int coroutine_fn backup_run(Job *job, Error **errp) int64_t count; for (offset = 0; offset < s->len; ) { - if (yield_and_check(s)) { - ret = -ECANCELED; - goto out; + if (job_is_cancelled(job)) { + return -ECANCELED; + } + + job_pause_point(job); + + if (job_is_cancelled(job)) { + return -ECANCELED; } ret = block_copy_reset_unallocated(s->bcs, offset, &count); if (ret < 0) { - goto out; + return ret; } offset += count; @@ -272,11 +296,37 @@ static int coroutine_fn backup_run(Job *job, Error **errp) job_yield(job); } } else { - ret = backup_loop(s); + return backup_loop(s); } - out: - return ret; + return 0; +} + +static void coroutine_fn backup_pause(Job *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); + + if (s->bg_bcs_call && !block_copy_call_finished(s->bg_bcs_call)) { + block_copy_call_cancel(s->bg_bcs_call); + s->wait = 
true; + qemu_coroutine_yield(); + } +} + +static void coroutine_fn backup_set_speed(BlockJob *job, int64_t speed) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + /* + * block_job_set_speed() is called first from block_job_create(), when we + * don't yet have s->bcs. + */ + if (s->bcs) { + block_copy_set_speed(s->bcs, speed); + if (s->bg_bcs_call) { + block_copy_kick(s->bg_bcs_call); + } + } } static const BlockJobDriver backup_job_driver = { @@ -289,7 +339,9 @@ static const BlockJobDriver backup_job_driver = { .commit = backup_commit, .abort = backup_abort, .clean = backup_clean, - } + .pause = backup_pause, + }, + .set_speed = backup_set_speed, }; static int64_t backup_calculate_cluster_size(BlockDriverState *target, @@ -335,6 +387,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BitmapSyncMode bitmap_mode, bool compress, const char *filter_node_name, + BackupPerf *perf, BlockdevOnError on_source_error, BlockdevOnError on_target_error, int creation_flags, @@ -386,6 +439,29 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, return NULL; } + cluster_size = backup_calculate_cluster_size(target, errp); + if (cluster_size < 0) { + goto error; + } + + if (perf->max_workers < 1) { + error_setg(errp, "max-workers must be greater than zero"); + return NULL; + } + + if (perf->max_chunk < 0) { + error_setg(errp, "max-chunk must be zero (which means no limit) or " + "positive"); + return NULL; + } + + if (perf->max_chunk && perf->max_chunk < cluster_size) { + error_setg(errp, "Required max-chunk (%" PRIi64 ") is less than backup " + "cluster size (%" PRIi64 ")", perf->max_chunk, cluster_size); + return NULL; + } + + if (sync_bitmap) { /* If we need to write to this bitmap, check that we can: */ if (bitmap_mode != BITMAP_SYNC_MODE_NEVER && @@ -418,11 +494,6 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, goto error; } - cluster_size = backup_calculate_cluster_size(target, errp); - 
if (cluster_size < 0) { - goto error; - } - /* * If source is in backing chain of target assume that target is going to be * used for "image fleecing", i.e. it should represent a kind of snapshot of @@ -441,7 +512,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, (compress ? BDRV_REQ_WRITE_COMPRESSED : 0), backup_top = bdrv_backup_top_append(bs, target, filter_node_name, - cluster_size, write_flags, &bcs, errp); + cluster_size, perf, + write_flags, &bcs, errp); if (!backup_top) { goto error; } @@ -464,9 +536,10 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, job->bcs = bcs; job->cluster_size = cluster_size; job->len = len; + job->perf = *perf; - block_copy_set_progress_callback(bcs, backup_progress_bytes_callback, job); block_copy_set_progress_meter(bcs, &job->common.job.progress); + block_copy_set_speed(bcs, speed); /* Required permissions are already taken by backup-top target */ block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL, diff --git a/block/block-copy.c b/block/block-copy.c index cd9bc47c8f..39ae481c8b 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -26,11 +26,34 @@ #define BLOCK_COPY_MAX_BUFFER (1 * MiB) #define BLOCK_COPY_MAX_MEM (128 * MiB) #define BLOCK_COPY_MAX_WORKERS 64 +#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */ static coroutine_fn int block_copy_task_entry(AioTask *task); typedef struct BlockCopyCallState { - bool failed; + /* IN parameters. Initialized in block_copy_async() and never changed. 
*/ + BlockCopyState *s; + int64_t offset; + int64_t bytes; + int max_workers; + int64_t max_chunk; + bool ignore_ratelimit; + BlockCopyAsyncCallbackFunc cb; + void *cb_opaque; + + /* Coroutine where async block-copy is running */ + Coroutine *co; + + /* To reference all call states from BlockCopyState */ + QLIST_ENTRY(BlockCopyCallState) list; + + /* State */ + int ret; + bool finished; + QemuCoSleepState *sleep_state; + bool cancelled; + + /* OUT parameters */ bool error_is_read; } BlockCopyCallState; @@ -65,7 +88,8 @@ typedef struct BlockCopyState { bool use_copy_range; int64_t copy_size; uint64_t len; - QLIST_HEAD(, BlockCopyTask) tasks; + QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls */ + QLIST_HEAD(, BlockCopyCallState) calls; BdrvRequestFlags write_flags; @@ -86,11 +110,11 @@ typedef struct BlockCopyState { bool skip_unallocated; ProgressMeter *progress; - /* progress_bytes_callback: called when some copying progress is done. */ - ProgressBytesCallbackFunc progress_bytes_callback; - void *progress_opaque; SharedResource *mem; + + uint64_t speed; + RateLimit rate_limit; } BlockCopyState; static BlockCopyTask *find_conflicting_task(BlockCopyState *s, @@ -134,10 +158,11 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState *s, int64_t offset, int64_t bytes) { BlockCopyTask *task; + int64_t max_chunk = MIN_NON_ZERO(s->copy_size, call_state->max_chunk); if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap, offset, offset + bytes, - s->copy_size, &offset, &bytes)) + max_chunk, &offset, &bytes)) { return NULL; } @@ -218,7 +243,7 @@ static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target) } BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, - int64_t cluster_size, + int64_t cluster_size, bool use_copy_range, BdrvRequestFlags write_flags, Error **errp) { BlockCopyState *s; @@ -260,24 +285,16 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, * We enable 
copy-range, but keep small copy_size, until first * successful copy_range (look at block_copy_do_copy). */ - s->use_copy_range = true; + s->use_copy_range = use_copy_range; s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER); } QLIST_INIT(&s->tasks); + QLIST_INIT(&s->calls); return s; } -void block_copy_set_progress_callback( - BlockCopyState *s, - ProgressBytesCallbackFunc progress_bytes_callback, - void *progress_opaque) -{ - s->progress_bytes_callback = progress_bytes_callback; - s->progress_opaque = progress_opaque; -} - void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm) { s->progress = pm; @@ -420,12 +437,11 @@ static coroutine_fn int block_copy_task_entry(AioTask *task) ret = block_copy_do_copy(t->s, t->offset, t->bytes, t->zeroes, &error_is_read); - if (ret < 0 && !t->call_state->failed) { - t->call_state->failed = true; + if (ret < 0 && !t->call_state->ret) { + t->call_state->ret = ret; t->call_state->error_is_read = error_is_read; } else { progress_work_done(t->s->progress, t->bytes); - t->s->progress_bytes_callback(t->bytes, t->s->progress_opaque); } co_put_to_shres(t->s->mem, t->bytes); block_copy_task_end(t, ret); @@ -544,15 +560,17 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s, * Returns 1 if dirty clusters found and successfully copied, 0 if no dirty * clusters found and -errno on failure. 
*/ -static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s, - int64_t offset, int64_t bytes, - bool *error_is_read) +static int coroutine_fn +block_copy_dirty_clusters(BlockCopyCallState *call_state) { + BlockCopyState *s = call_state->s; + int64_t offset = call_state->offset; + int64_t bytes = call_state->bytes; + int ret = 0; bool found_dirty = false; int64_t end = offset + bytes; AioTaskPool *aio = NULL; - BlockCopyCallState call_state = {false, false}; /* * block_copy() user is responsible for keeping source and target in same @@ -564,11 +582,11 @@ static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s, assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); assert(QEMU_IS_ALIGNED(bytes, s->cluster_size)); - while (bytes && aio_task_pool_status(aio) == 0) { + while (bytes && aio_task_pool_status(aio) == 0 && !call_state->cancelled) { BlockCopyTask *task; int64_t status_bytes; - task = block_copy_task_create(s, &call_state, offset, bytes); + task = block_copy_task_create(s, call_state, offset, bytes); if (!task) { /* No more dirty bits in the bitmap */ trace_block_copy_skip_range(s, offset, bytes); @@ -599,6 +617,21 @@ static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s, } task->zeroes = ret & BDRV_BLOCK_ZERO; + if (s->speed) { + if (!call_state->ignore_ratelimit) { + uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0); + if (ns > 0) { + block_copy_task_end(task, -EAGAIN); + g_free(task); + qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, ns, + &call_state->sleep_state); + continue; + } + } + + ratelimit_calculate_delay(&s->rate_limit, task->bytes); + } + trace_block_copy_process(s, task->offset); co_get_from_shres(s->mem, task->bytes); @@ -607,7 +640,7 @@ static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s, bytes = end - offset; if (!aio && bytes) { - aio = aio_task_pool_new(BLOCK_COPY_MAX_WORKERS); + aio = aio_task_pool_new(call_state->max_workers); } ret = block_copy_task_run(aio, task); @@ -633,15 
+666,19 @@ out: aio_task_pool_free(aio); } - if (error_is_read && ret < 0) { - *error_is_read = call_state.error_is_read; - } return ret < 0 ? ret : found_dirty; } +void block_copy_kick(BlockCopyCallState *call_state) +{ + if (call_state->sleep_state) { + qemu_co_sleep_wake(call_state->sleep_state); + } +} + /* - * block_copy + * block_copy_common * * Copy requested region, accordingly to dirty bitmap. * Collaborate with parallel block_copy requests: if they succeed it will help @@ -649,16 +686,18 @@ out: * it means that some I/O operation failed in context of _this_ block_copy call, * not some parallel operation. */ -int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes, - bool *error_is_read) +static int coroutine_fn block_copy_common(BlockCopyCallState *call_state) { int ret; - do { - ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read); + QLIST_INSERT_HEAD(&call_state->s->calls, call_state, list); - if (ret == 0) { - ret = block_copy_wait_one(s, offset, bytes); + do { + ret = block_copy_dirty_clusters(call_state); + + if (ret == 0 && !call_state->cancelled) { + ret = block_copy_wait_one(call_state->s, call_state->offset, + call_state->bytes); } /* @@ -670,11 +709,110 @@ int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes, * 2. We have waited for some intersecting block-copy request * It may have failed and produced new dirty bits. 
*/ - } while (ret > 0); + } while (ret > 0 && !call_state->cancelled); + + call_state->finished = true; + + if (call_state->cb) { + call_state->cb(call_state->cb_opaque); + } + + QLIST_REMOVE(call_state, list); return ret; } +int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes, + bool ignore_ratelimit) +{ + BlockCopyCallState call_state = { + .s = s, + .offset = start, + .bytes = bytes, + .ignore_ratelimit = ignore_ratelimit, + .max_workers = BLOCK_COPY_MAX_WORKERS, + }; + + return block_copy_common(&call_state); +} + +static void coroutine_fn block_copy_async_co_entry(void *opaque) +{ + block_copy_common(opaque); +} + +BlockCopyCallState *block_copy_async(BlockCopyState *s, + int64_t offset, int64_t bytes, + int max_workers, int64_t max_chunk, + BlockCopyAsyncCallbackFunc cb, + void *cb_opaque) +{ + BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1); + + *call_state = (BlockCopyCallState) { + .s = s, + .offset = offset, + .bytes = bytes, + .max_workers = max_workers, + .max_chunk = max_chunk, + .cb = cb, + .cb_opaque = cb_opaque, + + .co = qemu_coroutine_create(block_copy_async_co_entry, call_state), + }; + + qemu_coroutine_enter(call_state->co); + + return call_state; +} + +void block_copy_call_free(BlockCopyCallState *call_state) +{ + if (!call_state) { + return; + } + + assert(call_state->finished); + g_free(call_state); +} + +bool block_copy_call_finished(BlockCopyCallState *call_state) +{ + return call_state->finished; +} + +bool block_copy_call_succeeded(BlockCopyCallState *call_state) +{ + return call_state->finished && !call_state->cancelled && + call_state->ret == 0; +} + +bool block_copy_call_failed(BlockCopyCallState *call_state) +{ + return call_state->finished && !call_state->cancelled && + call_state->ret < 0; +} + +bool block_copy_call_cancelled(BlockCopyCallState *call_state) +{ + return call_state->cancelled; +} + +int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read) +{ + 
assert(call_state->finished); + if (error_is_read) { + *error_is_read = call_state->error_is_read; + } + return call_state->ret; +} + +void block_copy_call_cancel(BlockCopyCallState *call_state) +{ + call_state->cancelled = true; + block_copy_kick(call_state); +} + BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s) { return s->copy_bitmap; @@ -684,3 +822,18 @@ void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip) { s->skip_unallocated = skip; } + +void block_copy_set_speed(BlockCopyState *s, uint64_t speed) +{ + s->speed = speed; + if (speed > 0) { + ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME); + } + + /* + * Note: it's good to kick all call states from here, but it should be done + * only from a coroutine, to not crash if s->calls list changed while + * entering one call. So for now, the only user of this function kicks its + * only one call_state by hand. + */ +} diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2816e61afe..9cad9e1b8c 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,26 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { + bool active; + BlockDriverState *bottom_bs; + bool chain_frozen; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { + BlockDriverState *bottom_bs = NULL; + BDRVStateCOR *state = bs->opaque; + /* Find a bottom node name, if any */ + const char *bottom_node = qdict_get_try_str(options, "bottom"); + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -35,6 +50,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } + bs->supported_read_flags = BDRV_REQ_PREFETCH; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | (BDRV_REQ_FUA & 
bs->file->bs->supported_write_flags); @@ -42,6 +59,44 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); + if (bottom_node) { + bottom_bs = bdrv_find_node(bottom_node); + if (!bottom_bs) { + error_setg(errp, "Bottom node '%s' not found", bottom_node); + qdict_del(options, "bottom"); + return -EINVAL; + } + qdict_del(options, "bottom"); + + if (!bottom_bs->drv) { + error_setg(errp, "Bottom node '%s' not opened", bottom_node); + return -EINVAL; + } + + if (bottom_bs->drv->is_filter) { + error_setg(errp, "Bottom node '%s' is a filter", bottom_node); + return -EINVAL; + } + + if (bdrv_freeze_backing_chain(bs, bottom_bs, errp) < 0) { + return -EINVAL; + } + state->chain_frozen = true; + + /* + * We do freeze the chain, so it shouldn't be removed. Still, storing a + * pointer worth bdrv_ref(). + */ + bdrv_ref(bottom_bs); + } + state->active = true; + state->bottom_bs = bottom_bs; + + /* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +112,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { + BDRVStateCOR *s = bs->opaque; + + if (!s->active) { + /* + * While the filter is being removed + */ + *nperm = 0; + *nshared = BLK_PERM_ALL; + return; + } + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -74,21 +140,67 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { - return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); + int64_t n; + int local_flags; + int ret; + BDRVStateCOR *state = bs->opaque; + + if (!state->bottom_bs) { + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); + } + + while (bytes) { + local_flags = flags; + + /* In case of failure, try to copy-on-read anyway */ + ret = bdrv_is_allocated(bs->file->bs, offset, bytes, &n); + if (ret <= 0) { + ret = bdrv_is_allocated_above(bdrv_backing_chain_next(bs->file->bs), + state->bottom_bs, true, offset, + n, &n); + if (ret > 0 || ret < 0) { + local_flags |= BDRV_REQ_COPY_ON_READ; + } + /* Finish earlier if the end of a backing file has been reached */ + if (n == 0) { + break; + } + } + + /* Skip if neither read nor write are needed */ + if ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) != + BDRV_REQ_PREFETCH) { + ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } + } + + offset += n; + qiov_offset += n; + bytes -= n; + } + + return 0; } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - 
QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, int flags) { - - return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); + return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); } @@ -129,16 +241,31 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) } +static void cor_close(BlockDriverState *bs) +{ + BDRVStateCOR *s = bs->opaque; + + if (s->chain_frozen) { + s->chain_frozen = false; + bdrv_unfreeze_backing_chain(bs, s->bottom_bs); + } + + bdrv_unref(s->bottom_bs); +} + + static BlockDriver bdrv_copy_on_read = { .format_name = "copy-on-read", + .instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, + .bdrv_close = cor_close, .bdrv_child_perm = cor_child_perm, .bdrv_getlength = cor_getlength, - .bdrv_co_preadv = cor_co_preadv, - .bdrv_co_pwritev = cor_co_pwritev, + .bdrv_co_preadv_part = cor_co_preadv_part, + .bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, @@ -150,6 +277,39 @@ static BlockDriver bdrv_copy_on_read = { .is_filter = true, }; + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ + BdrvChild *child; + BlockDriverState *bs; + BDRVStateCOR *s = cor_filter_bs->opaque; + + child = bdrv_filter_child(cor_filter_bs); + if (!child) { + return; + } + bs = child->bs; + + /* Retain the BDS until we complete the graph change. */ + bdrv_ref(bs); + /* Hold a guest back from writing while permissions are being reset. */ + bdrv_drained_begin(bs); + /* Drop permissions before the graph change. 
*/ + s->active = false; + /* unfreeze, as otherwise bdrv_replace_node() will fail */ + if (s->chain_frozen) { + s->chain_frozen = false; + bdrv_unfreeze_backing_chain(cor_filter_bs, s->bottom_bs); + } + bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort); + bdrv_replace_node(cor_filter_bs, bs, &error_abort); + + bdrv_drained_end(bs); + bdrv_unref(bs); + bdrv_unref(cor_filter_bs); +} + + static void bdrv_copy_on_read_init(void) { bdrv_register(&bdrv_copy_on_read); diff --git a/block/copy-on-read.h b/block/copy-on-read.h new file mode 100644 index 0000000000..7bf405dccd --- /dev/null +++ b/block/copy-on-read.h @@ -0,0 +1,32 @@ +/* + * Copy-on-read filter block driver + * + * The filter driver performs Copy-On-Read (COR) operations + * + * Copyright (c) 2018-2020 Virtuozzo International GmbH. + * + * Author: + * Andrey Shinkevich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef BLOCK_COPY_ON_READ +#define BLOCK_COPY_ON_READ + +#include "block/block_int.h" + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs); + +#endif /* BLOCK_COPY_ON_READ */ diff --git a/block/file-posix.c b/block/file-posix.c index 00cdaaa2d4..11aafa9d82 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -216,6 +216,20 @@ typedef struct RawPosixAIOData { static int cdrom_reopen(BlockDriverState *bs); #endif +/* + * Elide EAGAIN and EACCES details when failing to lock, as this + * indicates that the specified file region is already locked by + * another process, which is considered a common scenario. + */ +#define raw_lock_error_setg_errno(errp, err, fmt, ...) \ + do { \ + if ((err) == EAGAIN || (err) == EACCES) { \ + error_setg((errp), (fmt), ## __VA_ARGS__); \ + } else { \ + error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__); \ + } \ + } while (0) + #if defined(__NetBSD__) static int raw_normalize_devicepath(const char **filename, Error **errp) { @@ -836,7 +850,8 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd, if ((perm_lock_bits & bit) && !(locked_perm & bit)) { ret = qemu_lock_fd(fd, off, 1, false); if (ret) { - error_setg(errp, "Failed to lock byte %d", off); + raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", + off); return ret; } else if (s) { s->locked_perm |= bit; @@ -844,7 +859,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd, } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) { ret = qemu_unlock_fd(fd, off, 1); if (ret) { - error_setg(errp, "Failed to unlock byte %d", off); + error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); return ret; } else if (s) { s->locked_perm &= ~bit; @@ -857,7 +872,8 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd, if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) { ret = qemu_lock_fd(fd, off, 1, false); if (ret) { - error_setg(errp, "Failed to lock byte %d", off); + raw_lock_error_setg_errno(errp, 
-ret, "Failed to lock byte %d", + off); return ret; } else if (s) { s->locked_shared_perm |= bit; @@ -866,7 +882,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd, !(shared_perm_lock_bits & bit)) { ret = qemu_unlock_fd(fd, off, 1); if (ret) { - error_setg(errp, "Failed to unlock byte %d", off); + error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); return ret; } else if (s) { s->locked_shared_perm &= ~bit; @@ -890,9 +906,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm, ret = qemu_lock_fd_test(fd, off, 1, true); if (ret) { char *perm_name = bdrv_perm_names(p); - error_setg(errp, - "Failed to get \"%s\" lock", - perm_name); + + raw_lock_error_setg_errno(errp, -ret, + "Failed to get \"%s\" lock", + perm_name); g_free(perm_name); return ret; } @@ -905,9 +922,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm, ret = qemu_lock_fd_test(fd, off, 1, true); if (ret) { char *perm_name = bdrv_perm_names(p); - error_setg(errp, - "Failed to get shared \"%s\" lock", - perm_name); + + raw_lock_error_setg_errno(errp, -ret, + "Failed to get shared \"%s\" lock", + perm_name); g_free(perm_name); return ret; } diff --git a/block/io.c b/block/io.c index 95b1c56c06..d203435a73 100644 --- a/block/io.c +++ b/block/io.c @@ -1453,6 +1453,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, if (flags & BDRV_REQ_COPY_ON_READ) { int64_t pnum; + /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ + flags &= ~BDRV_REQ_COPY_ON_READ; + ret = bdrv_is_allocated(bs, offset, bytes, &pnum); if (ret < 0) { goto out; @@ -1474,9 +1477,11 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } + assert(!(flags & ~bs->supported_read_flags)); + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { - ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 
qiov_offset, flags); goto out; } @@ -1489,7 +1494,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, num, qiov, - qiov_offset + bytes - bytes_remaining, 0); + qiov_offset + bytes - bytes_remaining, + flags); max_bytes -= num; } else { num = bytes_remaining; diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index d15a2be827..afd75ab628 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -507,9 +507,10 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) int64_t speed = qdict_get_try_int(qdict, "speed", 0); qmp_block_stream(true, device, device, base != NULL, base, false, NULL, - false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - &error); + false, NULL, false, NULL, + qdict_haskey(qdict, "speed"), speed, true, + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, &error); hmp_handle_error(mon, error); } diff --git a/block/replication.c b/block/replication.c index 0c70215784..97be7ef4de 100644 --- a/block/replication.c +++ b/block/replication.c @@ -454,6 +454,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, int64_t active_length, hidden_length, disk_length; AioContext *aio_context; Error *local_err = NULL; + BackupPerf perf = { .use_copy_range = true, .max_workers = 1 }; aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -558,6 +559,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, s->backup_job = backup_job_create( NULL, s->secondary_disk->bs, s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL, + &perf, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL, backup_job_completed, bs, NULL, &local_err); diff --git a/block/stream.c b/block/stream.c index 236384f2f7..1fa742b0db 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 
@@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -33,10 +35,11 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ + BlockDriverState *cor_filter_bs; + BlockDriverState *target_bs; BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; - bool chain_frozen; } StreamBlockJob; static int coroutine_fn stream_populate(BlockBackend *blk, @@ -44,39 +47,28 @@ static int coroutine_fn stream_populate(BlockBackend *blk, { assert(bytes < SIZE_MAX); - return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); -} - -static void stream_abort(Job *job) -{ - StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); - - if (s->chain_frozen) { - BlockJob *bjob = &s->common; - bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); - } + return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH); } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); - BlockJob *bjob = &s->common; - BlockDriverState *bs = blk_bs(bjob->blk); - BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); + BlockDriverState *unfiltered_base = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; - bdrv_unfreeze_backing_chain(bs, s->above_base); - s->chain_frozen = false; + /* We should drop filter at this point, as filter hold the backing chain */ + bdrv_cor_filter_drop(s->cor_filter_bs); + s->cor_filter_bs = NULL; if (bdrv_cow_child(unfiltered_bs)) { const char *base_id = NULL, *base_fmt = NULL; - if (base) { - base_id = s->backing_file_str; - 
if (base->drv) { - base_fmt = base->drv->format_name; + if (unfiltered_base) { + base_id = s->backing_file_str ?: unfiltered_base->filename; + if (unfiltered_base->drv) { + base_fmt = unfiltered_base->drv->format_name; } } bdrv_set_backing_hd(unfiltered_bs, base, &local_err); @@ -94,13 +86,17 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; - BlockDriverState *bs = blk_bs(bjob->blk); + + if (s->cor_filter_bs) { + bdrv_cor_filter_drop(s->cor_filter_bs); + s->cor_filter_bs = NULL; + } /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); - bdrv_reopen_set_read_only(bs, true, NULL); + bdrv_reopen_set_read_only(s->target_bs, true, NULL); } g_free(s->backing_file_str); @@ -110,9 +106,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; - BlockDriverState *bs = blk_bs(blk); - BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); - bool enable_cor = !bdrv_cow_child(s->base_overlay); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -124,21 +118,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) return 0; } - len = bdrv_getlength(bs); + len = bdrv_getlength(s->target_bs); if (len < 0) { return len; } job_progress_set_remaining(&s->common.job, len); - /* Turn on copy-on-read for the whole block device so that guest read - * requests help us make progress. Only do this when copying the entire - * backing chain since the copy-on-read operation does not take base into - * account. 
- */ - if (enable_cor) { - bdrv_enable_copy_on_read(bs); - } - for ( ; offset < len; offset += n) { bool copy; int ret; @@ -197,10 +182,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } } - if (enable_cor) { - bdrv_disable_copy_on_read(bs); - } - /* Do not remove the backing file if an error was there but ignored. */ return error; } @@ -212,7 +193,6 @@ static const BlockJobDriver stream_job_driver = { .free = block_job_free, .run = stream_run, .prepare = stream_prepare, - .abort = stream_abort, .clean = stream_clean, .user_resume = block_job_user_resume, }, @@ -220,59 +200,113 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, + BlockDriverState *bottom, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; bool bs_read_only; int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; - BlockDriverState *base_overlay = bdrv_find_overlay(bs, base); + BlockDriverState *base_overlay; + BlockDriverState *cor_filter_bs = NULL; BlockDriverState *above_base; + QDict *opts; - if (!base_overlay) { - error_setg(errp, "'%s' is not in the backing chain of '%s'", - base->node_name, bs->node_name); - return; - } + assert(!(base && bottom)); + assert(!(backing_file_str && bottom)); - /* - * Find the node directly above @base. @base_overlay is a COW overlay, so - * it must have a bdrv_cow_child(), but it is the immediate overlay of - * @base, so between the two there can only be filters. - */ - above_base = base_overlay; - if (bdrv_cow_bs(above_base) != base) { - above_base = bdrv_cow_bs(above_base); - while (bdrv_filter_bs(above_base) != base) { - above_base = bdrv_filter_bs(above_base); + if (bottom) { + /* + * New simple interface. 
The code is written in terms of old interface + * with @base parameter (still, it doesn't freeze link to base, so in + * this mean old code is correct for new interface). So, for now, just + * emulate base_overlay and above_base. Still, when old interface + * finally removed, we should refactor code to use only "bottom", but + * not "*base*" things. + */ + assert(!bottom->drv->is_filter); + base_overlay = above_base = bottom; + } else { + base_overlay = bdrv_find_overlay(bs, base); + if (!base_overlay) { + error_setg(errp, "'%s' is not in the backing chain of '%s'", + base->node_name, bs->node_name); + return; } - } - if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) { - return; + /* + * Find the node directly above @base. @base_overlay is a COW overlay, + * so it must have a bdrv_cow_child(), but it is the immediate overlay + * of @base, so between the two there can only be filters. + */ + above_base = base_overlay; + if (bdrv_cow_bs(above_base) != base) { + above_base = bdrv_cow_bs(above_base); + while (bdrv_filter_bs(above_base) != base) { + above_base = bdrv_filter_bs(above_base); + } + } } /* Make sure that the image is opened in read-write mode */ bs_read_only = bdrv_is_read_only(bs); if (bs_read_only) { - if (bdrv_reopen_set_read_only(bs, false, errp) != 0) { - bs_read_only = false; - goto fail; + int ret; + /* Hold the chain during reopen */ + if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) { + return; + } + + ret = bdrv_reopen_set_read_only(bs, false, errp); + + /* failure, or cor-filter will hold the chain */ + bdrv_unfreeze_backing_chain(bs, above_base); + + if (ret < 0) { + return; } } - /* Prevent concurrent jobs trying to modify the graph structure here, we - * already have our own plans. Also don't allow resize as the image size is - * queried only at the job start and then cached. 
*/ - s = block_job_create(job_id, &stream_job_driver, NULL, bs, - basic_flags | BLK_PERM_GRAPH_MOD, + opts = qdict_new(); + + qdict_put_str(opts, "driver", "copy-on-read"); + qdict_put_str(opts, "file", bdrv_get_node_name(bs)); + /* Pass the base_overlay node name as 'bottom' to COR driver */ + qdict_put_str(opts, "bottom", base_overlay->node_name); + if (filter_node_name) { + qdict_put_str(opts, "node-name", filter_node_name); + } + + cor_filter_bs = bdrv_insert_node(bs, opts, BDRV_O_RDWR, errp); + if (!cor_filter_bs) { + goto fail; + } + + if (!filter_node_name) { + cor_filter_bs->implicit = true; + } + + s = block_job_create(job_id, &stream_job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, basic_flags | BLK_PERM_WRITE, speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; } + /* + * Prevent concurrent jobs trying to modify the graph structure here, we + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ + if (block_job_add_bdrv(&s->common, "active node", bs, 0, + basic_flags | BLK_PERM_WRITE, &error_abort)) { + goto fail; + } + /* Block all intermediate nodes between bs and base, because they will * disappear from the chain after this operation. 
The streaming job reads * every block only once, assuming that it doesn't change, so forbid writes @@ -293,8 +327,9 @@ void stream_start(const char *job_id, BlockDriverState *bs, s->base_overlay = base_overlay; s->above_base = above_base; s->backing_file_str = g_strdup(backing_file_str); + s->cor_filter_bs = cor_filter_bs; + s->target_bs = bs; s->bs_read_only = bs_read_only; - s->chain_frozen = true; s->on_error = on_error; trace_stream_start(bs, base, s); @@ -302,8 +337,10 @@ void stream_start(const char *job_id, BlockDriverState *bs, return; fail: + if (cor_filter_bs) { + bdrv_cor_filter_drop(cor_filter_bs); + } if (bs_read_only) { bdrv_reopen_set_read_only(bs, true, NULL); } - bdrv_unfreeze_backing_chain(bs, above_base); } diff --git a/blockdev.c b/blockdev.c index 2431448c5d..93417f6302 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2500,19 +2500,39 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_base, const char *base, bool has_base_node, const char *base_node, bool has_backing_file, const char *backing_file, + bool has_bottom, const char *bottom, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) { - BlockDriverState *bs, *iter; + BlockDriverState *bs, *iter, *iter_end; BlockDriverState *base_bs = NULL; + BlockDriverState *bottom_bs = NULL; AioContext *aio_context; Error *local_err = NULL; - const char *base_name = NULL; int job_flags = JOB_DEFAULT; + if (has_base && has_base_node) { + error_setg(errp, "'base' and 'base-node' cannot be specified " + "at the same time"); + return; + } + + if (has_base && has_bottom) { + error_setg(errp, "'base' and 'bottom' cannot be specified " + "at the same time"); + return; + } + + if (has_bottom && has_base_node) { + error_setg(errp, "'bottom' and 'base-node' cannot be specified " + "at the same 
time"); + return; + } + if (!has_on_error) { on_error = BLOCKDEV_ON_ERROR_REPORT; } @@ -2525,12 +2545,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (has_base && has_base_node) { - error_setg(errp, "'base' and 'base-node' cannot be specified " - "at the same time"); - goto out; - } - if (has_base) { base_bs = bdrv_find_backing_image(bs, base); if (base_bs == NULL) { @@ -2538,7 +2552,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); - base_name = base; } if (has_base_node) { @@ -2553,11 +2566,35 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); - base_name = base_bs->filename; } - /* Check for op blockers in the whole chain between bs and base */ - for (iter = bs; iter && iter != base_bs; + if (has_bottom) { + bottom_bs = bdrv_lookup_bs(NULL, bottom, errp); + if (!bottom_bs) { + goto out; + } + if (!bottom_bs->drv) { + error_setg(errp, "Node '%s' is not open", bottom); + goto out; + } + if (bottom_bs->drv->is_filter) { + error_setg(errp, "Node '%s' is a filter, use a non-filter node " + "as 'bottom'", bottom); + goto out; + } + if (!bdrv_chain_contains(bs, bottom_bs)) { + error_setg(errp, "Node '%s' is not in a chain starting from '%s'", + bottom, device); + goto out; + } + assert(bdrv_get_aio_context(bottom_bs) == aio_context); + } + + /* + * Check for op blockers in the whole chain between bs and base (or bottom) + */ + iter_end = has_bottom ? 
bdrv_filter_or_cow_bs(bottom_bs) : base_bs; + for (iter = bs; iter && iter != iter_end; iter = bdrv_filter_or_cow_bs(iter)) { if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) { @@ -2573,9 +2610,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } - /* backing_file string overrides base bs filename */ - base_name = has_backing_file ? backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2583,8 +2617,9 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } - stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, &local_err); + stream_start(has_job_id ? job_id : NULL, bs, base_bs, backing_file, + bottom_bs, job_flags, has_speed ? speed : 0, on_error, + filter_node_name, &local_err); if (local_err) { error_propagate(errp, local_err); goto out; @@ -2794,6 +2829,7 @@ static BlockJob *do_backup_common(BackupCommon *backup, { BlockJob *job = NULL; BdrvDirtyBitmap *bmap = NULL; + BackupPerf perf = { .max_workers = 64 }; int job_flags = JOB_DEFAULT; if (!backup->has_speed) { @@ -2818,6 +2854,18 @@ static BlockJob *do_backup_common(BackupCommon *backup, backup->compress = false; } + if (backup->x_perf) { + if (backup->x_perf->has_use_copy_range) { + perf.use_copy_range = backup->x_perf->use_copy_range; + } + if (backup->x_perf->has_max_workers) { + perf.max_workers = backup->x_perf->max_workers; + } + if (backup->x_perf->has_max_chunk) { + perf.max_chunk = backup->x_perf->max_chunk; + } + } + if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) || (backup->sync == MIRROR_SYNC_MODE_INCREMENTAL)) { /* done before desugaring 'incremental' to print the right message */ @@ -2891,6 +2939,7 @@ static BlockJob *do_backup_common(BackupCommon *backup, backup->sync, bmap, backup->bitmap_mode, backup->compress, backup->filter_node_name, + &perf, backup->on_source_error, 
backup->on_target_error, job_flags, NULL, NULL, txn, errp); diff --git a/blockjob.c b/blockjob.c index 98ac8af982..db3a21699c 100644 --- a/blockjob.c +++ b/blockjob.c @@ -256,6 +256,7 @@ static bool job_timer_pending(Job *job) void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { + const BlockJobDriver *drv = block_job_driver(job); int64_t old_speed = job->speed; if (job_apply_verb(&job->job, JOB_VERB_SET_SPEED, errp)) { @@ -270,6 +271,11 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&job->limit, speed, BLOCK_JOB_SLICE_TIME); job->speed = speed; + + if (drv->set_speed) { + drv->set_speed(job, speed); + } + if (speed && speed <= old_speed) { return; } diff --git a/include/block/block-copy.h b/include/block/block-copy.h index aac85e1488..338f2ea7fd 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -18,19 +18,15 @@ #include "block/block.h" #include "qemu/co-shared-resource.h" -typedef void (*ProgressBytesCallbackFunc)(int64_t bytes, void *opaque); +typedef void (*BlockCopyAsyncCallbackFunc)(void *opaque); typedef struct BlockCopyState BlockCopyState; +typedef struct BlockCopyCallState BlockCopyCallState; BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, - int64_t cluster_size, + int64_t cluster_size, bool use_copy_range, BdrvRequestFlags write_flags, Error **errp); -void block_copy_set_progress_callback( - BlockCopyState *s, - ProgressBytesCallbackFunc progress_bytes_callback, - void *progress_opaque); - void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm); void block_copy_state_free(BlockCopyState *s); @@ -39,7 +35,56 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s, int64_t offset, int64_t *count); int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes, - bool *error_is_read); + bool ignore_ratelimit); + +/* + * Run block-copy in a coroutine, create corresponding BlockCopyCallState + * object and return 
pointer to it. Never returns NULL. + * + * Caller is responsible to call block_copy_call_free() to free + * BlockCopyCallState object. + * + * @max_workers means maximum of parallel coroutines to execute sub-requests, + * must be > 0. + * + * @max_chunk means maximum length for one IO operation. Zero means unlimited. + */ +BlockCopyCallState *block_copy_async(BlockCopyState *s, + int64_t offset, int64_t bytes, + int max_workers, int64_t max_chunk, + BlockCopyAsyncCallbackFunc cb, + void *cb_opaque); + +/* + * Free finished BlockCopyCallState. Trying to free running + * block-copy will crash. + */ +void block_copy_call_free(BlockCopyCallState *call_state); + +/* + * Note, that block-copy call is marked finished prior to calling + * the callback. + */ +bool block_copy_call_finished(BlockCopyCallState *call_state); +bool block_copy_call_succeeded(BlockCopyCallState *call_state); +bool block_copy_call_failed(BlockCopyCallState *call_state); +bool block_copy_call_cancelled(BlockCopyCallState *call_state); +int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read); + +void block_copy_set_speed(BlockCopyState *s, uint64_t speed); +void block_copy_kick(BlockCopyCallState *call_state); + +/* + * Cancel running block-copy call. + * + * Cancel leaves block-copy state valid: dirty bits are correct and you may use + * cancel + a rerun with the same parameters to emulate pause/resume. + * + * Note also, that the cancel is async: it only marks block-copy call to be + * cancelled. So, the call may be cancelled (block_copy_call_cancelled() reports + * true) but not yet finished (block_copy_call_finished() reports false). 
+ */ +void block_copy_call_cancel(BlockCopyCallState *call_state); BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s); void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip); diff --git a/include/block/block.h b/include/block/block.h index a193545b6a..81fcaad5ac 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -72,9 +72,11 @@ typedef enum { BDRV_REQ_NO_FALLBACK = 0x100, /* - * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ - * on read request and means that caller doesn't really need data to be - * written to qiov parameter which may be NULL. + * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read + * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR + * filter is involved), in which case it signals that the COR operation + * need not read the data into memory (qiov) but only ensure they are + * copied to the top layer (i.e., that COR operation is done). */ BDRV_REQ_PREFETCH = 0x200, @@ -358,6 +360,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, Error **errp); void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); diff --git a/include/block/block_int.h b/include/block/block_int.h index b9ef61fe4d..d01fc23720 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -881,6 +881,10 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; + /* + * Flags honored during pread + */ + unsigned int supported_read_flags; /* Flags honored during pwrite (so far: BDRV_REQ_FUA, * BDRV_REQ_WRITE_UNCHANGED). 
* If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those @@ -1143,6 +1147,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the stream job inserts into the graph above + * @bs. NULL means that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1154,8 +1161,11 @@ int is_windows_drive(const char *filename); */ void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, + BlockDriverState *bottom, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: @@ -1256,6 +1266,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs, * @sync_mode: What parts of the disk image should be copied to the destination. * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental' * @bitmap_mode: The bitmap synchronization policy to use. + * @perf: Performance options. All actual fields assumed to be present, + * all ".has_*" fields are ignored. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. * @creation_flags: Flags that control the behavior of the Job lifetime. 
@@ -1274,6 +1286,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BitmapSyncMode bitmap_mode, bool compress, const char *filter_node_name, + BackupPerf *perf, BlockdevOnError on_source_error, BlockdevOnError on_target_error, int creation_flags, diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index e2824a36a8..6633d83da2 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -52,6 +52,8 @@ struct BlockJobDriver { * besides job->blk to the new AioContext. */ void (*attached_aio_context)(BlockJob *job, AioContext *new_context); + + void (*set_speed)(BlockJob *job, int64_t speed); }; /** diff --git a/job.c b/job.c index 8fecf38960..3aaaebafe2 100644 --- a/job.c +++ b/job.c @@ -553,6 +553,9 @@ static bool job_timer_not_pending(Job *job) void job_pause(Job *job) { job->pause_count++; + if (!job->paused) { + job_enter(job); + } } void job_resume(Job *job) diff --git a/qapi/block-core.json b/qapi/block-core.json index 3484986d1c..9f555d5c1d 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1371,6 +1371,30 @@ { 'struct': 'BlockdevSnapshot', 'data': { 'node': 'str', 'overlay': 'str' } } +## +# @BackupPerf: +# +# Optional parameters for backup. These parameters don't affect +# functionality, but may significantly affect performance. +# +# @use-copy-range: Use copy offloading. Default false. +# +# @max-workers: Maximum number of parallel requests for the sustained background +# copying process. Doesn't influence copy-before-write operations. +# Default 64. +# +# @max-chunk: Maximum request length for the sustained background copying +# process. Doesn't influence copy-before-write operations. +# 0 means unlimited. If max-chunk is non-zero then it should not be +# less than job cluster size which is calculated as maximum of +# target image cluster size and 64k. Default 0. 
+# +# Since: 6.0 +## +{ 'struct': 'BackupPerf', + 'data': { '*use-copy-range': 'bool', + '*max-workers': 'int', '*max-chunk': 'int64' } } + ## # @BackupCommon: # @@ -1426,6 +1450,8 @@ # above node specified by @drive. If this option is not given, # a node name is autogenerated. (Since: 4.2) # +# @x-perf: Performance options. (Since 6.0) +# # Note: @on-source-error and @on-target-error only affect background # I/O. If an error occurs during a guest write request, the device's # rerror/werror actions will be used. @@ -1440,7 +1466,7 @@ '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError', '*auto-finalize': 'bool', '*auto-dismiss': 'bool', - '*filter-node-name': 'str' } } + '*filter-node-name': 'str', '*x-perf': 'BackupPerf' } } ## # @DriveBackup: @@ -2517,10 +2543,14 @@ # @device: the device or node name of the top image # # @base: the common backing file name. -# It cannot be set if @base-node is also set. +# It cannot be set if @base-node or @bottom is also set. # # @base-node: the node name of the backing file. -# It cannot be set if @base is also set. (Since 2.8) +# It cannot be set if @base or @bottom is also set. (Since 2.8) +# +# @bottom: the last node in the chain that should be streamed into +# top. It cannot be set if @base or @base-node is also set. +# It cannot be a filter node. (Since 6.0) # # @backing-file: The backing file string to write into the top # image. This filename is not validated. @@ -2543,6 +2573,11 @@ # 'stop' and 'enospc' can only be used if the block device # supports io-status (see BlockInfo). Since 1.3. # +# @filter-node-name: the node name that should be assigned to the +# filter driver that the stream job inserts into the graph +# above @device. If this option is not given, a node name is +# autogenerated. (Since: 6.0) +# # @auto-finalize: When false, this job will wait in a PENDING state after it has # finished its work, waiting for @block-job-finalize before # making any block graph changes. 
@@ -2571,8 +2606,9 @@ ## { 'command': 'block-stream', 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', - '*base-node': 'str', '*backing-file': 'str', '*speed': 'int', - '*on-error': 'BlockdevOnError', + '*base-node': 'str', '*backing-file': 'str', '*bottom': 'str', + '*speed': 'int', '*on-error': 'BlockdevOnError', + '*filter-node-name': 'str', '*auto-finalize': 'bool', '*auto-dismiss': 'bool' } } ## @@ -3953,6 +3989,24 @@ 'data': { 'throttle-group': 'str', 'file' : 'BlockdevRef' } } + +## +# @BlockdevOptionsCor: +# +# Driver specific block device options for the copy-on-read driver. +# +# @bottom: The name of a non-filter node (allocation-bearing layer) that +# limits the COR operations in the backing chain (inclusive), so +# that no data below this node will be copied by this filter. +# If option is absent, the limit is not applied, so that data +# from all backing layers may be copied. +# +# Since: 6.0 +## +{ 'struct': 'BlockdevOptionsCor', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*bottom': 'str' } } + ## # @BlockdevOptions: # @@ -4005,7 +4059,7 @@ 'bochs': 'BlockdevOptionsGenericFormat', 'cloop': 'BlockdevOptionsGenericFormat', 'compress': 'BlockdevOptionsGenericFormat', - 'copy-on-read':'BlockdevOptionsGenericFormat', + 'copy-on-read':'BlockdevOptionsCor', 'dmg': 'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', 'ftp': 'BlockdevOptionsCurlFtp', diff --git a/scripts/simplebench/bench-backup.py b/scripts/simplebench/bench-backup.py new file mode 100755 index 0000000000..33a1ecfefa --- /dev/null +++ b/scripts/simplebench/bench-backup.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# +# Bench backup block-job +# +# Copyright (c) 2020 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see https://www.gnu.org/licenses/. +# + +import argparse +import json + +import simplebench +from results_to_text import results_to_text +from bench_block_job import bench_block_copy, drv_file, drv_nbd + + +def bench_func(env, case): + """ Handle one "cell" of benchmarking table. """ + cmd_options = env['cmd-options'] if 'cmd-options' in env else {} + return bench_block_copy(env['qemu-binary'], env['cmd'], + cmd_options, + case['source'], case['target']) + + +def bench(args): + test_cases = [] + + sources = {} + targets = {} + for d in args.dir: + label, path = d.split(':') # paths with colon not supported + sources[label] = drv_file(path + '/test-source') + targets[label] = drv_file(path + '/test-target') + + if args.nbd: + nbd = args.nbd.split(':') + host = nbd[0] + port = '10809' if len(nbd) == 1 else nbd[1] + drv = drv_nbd(host, port) + sources['nbd'] = drv + targets['nbd'] = drv + + for t in args.test: + src, dst = t.split(':') + + test_cases.append({ + 'id': t, + 'source': sources[src], + 'target': targets[dst] + }) + + binaries = [] # list of (