From c13e80d792d00fbdb48e8e81bcc0d7616cc29821 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Mon, 5 Nov 2018 13:08:03 -0200 Subject: [PATCH 01/42] block: adding lzfse decompressing support as a module. QEMU dmg support includes zlib and bzip2, but it does not contains lzfse support. This commit adds the source file to extend compression support for new DMGs. Signed-off-by: Julio Faracco Signed-off-by: Kevin Wolf --- block/dmg-lzfse.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 block/dmg-lzfse.c diff --git a/block/dmg-lzfse.c b/block/dmg-lzfse.c new file mode 100644 index 0000000000..19d25bc646 --- /dev/null +++ b/block/dmg-lzfse.c @@ -0,0 +1,49 @@ +/* + * DMG lzfse uncompression + * + * Copyright (c) 2018 Julio Cesar Faracco + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "dmg.h" +#include + +static int dmg_uncompress_lzfse_do(char *next_in, unsigned int avail_in, + char *next_out, unsigned int avail_out) +{ + size_t out_size = lzfse_decode_buffer((uint8_t *) next_out, avail_out, + (uint8_t *) next_in, avail_in, + NULL); + + /* We need to decode the single chunk only. */ + /* So, out_size == avail_out is not an error here. */ + if (out_size > 0) { + return out_size; + } + return -1; +} + +__attribute__((constructor)) +static void dmg_lzfse_init(void) +{ + assert(!dmg_uncompress_lzfse); + dmg_uncompress_lzfse = dmg_uncompress_lzfse_do; +} From 83bc1f9768f29d350494dba9b36d5da217fbb989 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Mon, 5 Nov 2018 13:08:04 -0200 Subject: [PATCH 02/42] configure: adding support to lzfse library. This commit includes the support to lzfse opensource library. With this library dmg block driver can decompress images with this type of compression inside. Signed-off-by: Julio Faracco Signed-off-by: Kevin Wolf --- block/Makefile.objs | 2 ++ configure | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/block/Makefile.objs b/block/Makefile.objs index 46d585cfd0..7a81892a52 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -57,6 +57,8 @@ ssh.o-libs := $(LIBSSH2_LIBS) block-obj-dmg-bz2-$(CONFIG_BZIP2) += dmg-bz2.o block-obj-$(if $(CONFIG_DMG),m,n) += $(block-obj-dmg-bz2-y) dmg-bz2.o-libs := $(BZIP2_LIBS) +block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o +dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio parallels.o-cflags := $(LIBXML2_CFLAGS) diff --git a/configure b/configure index 0a3c6a72c3..f32d4fa925 100755 --- a/configure +++ b/configure @@ -434,6 +434,7 @@ capstone="" lzo="" snappy="" bzip2="" +lzfse="" guest_agent="" guest_agent_with_vss="no" guest_agent_ntddscsi="no" @@ -1310,6 +1311,10 @@ for opt do ;; --enable-bzip2) bzip2="yes" ;; + --enable-lzfse) lzfse="yes" + ;; + --disable-lzfse) lzfse="no" + ;; --enable-guest-agent) guest_agent="yes" ;; --disable-guest-agent) guest_agent="no" @@ -1745,6 +1750,8 @@ disabled with --disable-FEATURE, default is enabled if available: snappy support of snappy compression library bzip2 support of bzip2 compression library (for reading bzip2-compressed dmg images) + lzfse support of lzfse compression library + (for reading lzfse-compressed dmg images) seccomp seccomp support coroutine-pool coroutine freelist (better performance) glusterfs GlusterFS backend @@ -2263,6 +2270,24 @@ EOF fi fi +########################################## +# lzfse check + +if test "$lzfse" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { lzfse_decode_scratch_size(); return 0; } +EOF + if compile_prog "" "-llzfse" ; then + lzfse="yes" + else + if test "$lzfse" = "yes"; then + feature_not_found "lzfse" "Install lzfse devel" + fi + lzfse="no" + fi +fi + ########################################## # libseccomp check @@ -6090,6 +6115,7 @@ echo "Live block migration $live_block_migration" echo "lzo support $lzo" echo "snappy support $snappy" echo "bzip2 support $bzip2" +echo "lzfse support $lzfse" echo "NUMA host support $numa" echo "libxml2 $libxml2" echo "tcmalloc support $tcmalloc" @@ -6612,6 +6638,11 @@ if test "$bzip2" = "yes" ; then echo "BZIP2_LIBS=-lbz2" >> $config_host_mak fi +if test "$lzfse" = "yes" ; then + echo "CONFIG_LZFSE=y" >> $config_host_mak + echo "LZFSE_LIBS=-llzfse" >> $config_host_mak +fi + if test "$libiscsi" = "yes" ; then echo "CONFIG_LIBISCSI=m" >> $config_host_mak echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak From 7a40b418ec95e80a7d164acb5104a9fa50ecc901 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Mon, 5 Nov 2018 13:08:05 -0200 Subject: [PATCH 03/42] dmg: including dmg-lzfse module inside dmg block driver. This commit includes the support to new module dmg-lzfse into dmg block driver. It includes the support for block type ULFO (0x80000007). Signed-off-by: Julio Faracco Signed-off-by: Kevin Wolf --- block/dmg.c | 28 ++++++++++++++++++++++++++++ block/dmg.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index 1d9283ba2f..86bb2344ea 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -33,6 +33,9 @@ int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, char *next_out, unsigned int avail_out); +int (*dmg_uncompress_lzfse)(char *next_in, unsigned int avail_in, + char *next_out, unsigned int avail_out); + enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used * or truncating when converting to 32-bit types @@ -107,6 +110,7 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, switch (s->types[chunk]) { case 0x80000005: /* zlib compressed */ case 0x80000006: /* bzip2 compressed */ + case 0x80000007: /* lzfse compressed */ compressed_size = s->lengths[chunk]; uncompressed_sectors = s->sectorcounts[chunk]; break; @@ -188,6 +192,8 @@ static bool dmg_is_known_block_type(uint32_t entry_type) return true; case 0x80000006: /* bzip2 */ return !!dmg_uncompress_bz2; + case 0x80000007: /* lzfse */ + return !!dmg_uncompress_lzfse; default: return false; } @@ -425,6 +431,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, } block_module_load_one("dmg-bz2"); + block_module_load_one("dmg-lzfse"); s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; @@ -623,6 +630,27 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return ret; } break; + case 0x80000007: + if (!dmg_uncompress_lzfse) { + break; + } + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + ret = dmg_uncompress_lzfse((char *)s->compressed_chunk, + (unsigned int) s->lengths[chunk], + (char *)s->uncompressed_chunk, + (unsigned int) + (512 * s->sectorcounts[chunk])); + if (ret < 0) { + return ret; + } + break; case 1: /* copy */ ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); diff --git a/block/dmg.h b/block/dmg.h index 2ecf239ba5..f28929998f 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -55,4 +55,7 @@ typedef struct BDRVDMGState { extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, char *next_out, unsigned int avail_out); +extern int (*dmg_uncompress_lzfse)(char *next_in, unsigned int avail_in, + char *next_out, unsigned int avail_out); + #endif From 95a156f689269da57e42dd1b001ea86c0877ed86 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Mon, 5 Nov 2018 13:08:06 -0200 Subject: [PATCH 04/42] dmg: exchanging hardcoded dmg UDIF block types to enum. This change is better to understand what kind of block type is being handled by the code. Using a syntax similar to the DMG documentation is easier than tracking all hex values assigned to a block type. Signed-off-by: Julio Faracco Signed-off-by: Kevin Wolf --- block/dmg.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 86bb2344ea..50e91aef6d 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -44,6 +44,19 @@ enum { DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, }; +enum { + /* DMG Block Type */ + UDZE = 0, /* Zeroes */ + UDRW, /* RAW type */ + UDIG, /* Ignore */ + UDCO = 0x80000004, + UDZO, + UDBZ, + ULFO, + UDCM = 0x7ffffffe, /* Comments */ + UDLE /* Last Entry */ +}; + static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) { int len; @@ -108,16 +121,16 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, uint32_t uncompressed_sectors = 0; switch (s->types[chunk]) { - case 0x80000005: /* zlib compressed */ - case 0x80000006: /* bzip2 compressed */ - case 0x80000007: /* lzfse compressed */ + case UDZO: /* zlib compressed */ + case UDBZ: /* bzip2 compressed */ + case ULFO: /* lzfse compressed */ compressed_size = s->lengths[chunk]; uncompressed_sectors = s->sectorcounts[chunk]; break; - case 1: /* copy */ + case UDRW: /* copy */ uncompressed_sectors = DIV_ROUND_UP(s->lengths[chunk], 512); break; - case 2: /* zero */ + case UDIG: /* zero */ /* as the all-zeroes block may be large, it is treated specially: the * sector is not copied from a large buffer, a simple memset is used * instead. Therefore uncompressed_sectors does not need to be set. */ @@ -186,13 +199,13 @@ typedef struct DmgHeaderState { static bool dmg_is_known_block_type(uint32_t entry_type) { switch (entry_type) { - case 0x00000001: /* uncompressed */ - case 0x00000002: /* zeroes */ - case 0x80000005: /* zlib */ + case UDRW: /* uncompressed */ + case UDIG: /* zeroes */ + case UDZO: /* zlib */ return true; - case 0x80000006: /* bzip2 */ + case UDBZ: /* bzip2 */ return !!dmg_uncompress_bz2; - case 0x80000007: /* lzfse */ + case ULFO: /* lzfse */ return !!dmg_uncompress_lzfse; default: return false; @@ -586,7 +599,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) s->current_chunk = s->n_chunks; switch (s->types[chunk]) { /* block entry type */ - case 0x80000005: { /* zlib compressed */ + case UDZO: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ ret = bdrv_pread(bs->file, s->offsets[chunk], @@ -609,7 +622,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return -1; } break; } - case 0x80000006: /* bzip2 compressed */ + case UDBZ: /* bzip2 compressed */ if (!dmg_uncompress_bz2) { break; } @@ -630,7 +643,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return ret; } break; - case 0x80000007: + case ULFO: if (!dmg_uncompress_lzfse) { break; } @@ -651,14 +664,14 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return ret; } break; - case 1: /* copy */ + case UDRW: /* copy */ ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; } break; - case 2: /* zero */ + case UDIG: /* zero */ /* see dmg_read, it is treated specially. No buffer needs to be * pre-filled, the zeroes can be set directly. */ break; From e4f9752c4a9c1b5d33bf6494aaff261b401933f2 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 19 Sep 2018 15:43:42 +0300 Subject: [PATCH 05/42] block/replication: drop extra synchronization After commit f8d59dfb40 "block/backup: fix fleecing scheme: use serialized writes" fleecing (specifically reading from backup target, when backup source is in backing chain of backup target) is safe, because all backup-job writes to target are serialized. Therefore we don't need additional synchronization for these reads. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/replication.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/block/replication.c b/block/replication.c index 6349d6958e..0c2989d2cb 100644 --- a/block/replication.c +++ b/block/replication.c @@ -218,9 +218,6 @@ static coroutine_fn int replication_co_readv(BlockDriverState *bs, QEMUIOVector *qiov) { BDRVReplicationState *s = bs->opaque; - BdrvChild *child = s->secondary_disk; - BlockJob *job = NULL; - CowRequest req; int ret; if (s->mode == REPLICATION_MODE_PRIMARY) { @@ -233,28 +230,9 @@ static coroutine_fn int replication_co_readv(BlockDriverState *bs, return ret; } - if (child && child->bs) { - job = child->bs->job; - } - - if (job) { - uint64_t remaining_bytes = remaining_sectors * BDRV_SECTOR_SIZE; - - backup_wait_for_overlapping_requests(child->bs->job, - sector_num * BDRV_SECTOR_SIZE, - remaining_bytes); - backup_cow_request_begin(&req, child->bs->job, - sector_num * BDRV_SECTOR_SIZE, - remaining_bytes); - ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE, - remaining_bytes, qiov, 0); - backup_cow_request_end(&req); - goto out; - } - ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE, remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0); -out: + return replication_return_value(s, ret); } From 3a75187fd878be685e38d94583aa14bb87b892e9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 19 Sep 2018 15:43:43 +0300 Subject: [PATCH 06/42] block/backup: drop unused synchronization interface Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/backup.c | 38 +++++++----------------------------- include/block/block_backup.h | 13 ------------ 2 files changed, 7 insertions(+), 44 deletions(-) diff --git a/block/backup.c b/block/backup.c index 4d084f6ca6..b829b251eb 100644 --- a/block/backup.c +++ b/block/backup.c @@ -28,6 +28,13 @@ #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) +typedef struct CowRequest { + int64_t start_byte; + int64_t end_byte; + QLIST_ENTRY(CowRequest) list; + CoQueue wait_queue; /* coroutines blocked on this request */ +} CowRequest; + typedef struct BackupBlockJob { BlockJob common; BlockBackend *target; @@ -322,37 +329,6 @@ void backup_do_checkpoint(BlockJob *job, Error **errp) hbitmap_set(backup_job->copy_bitmap, 0, len); } -void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset, - uint64_t bytes) -{ - BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); - int64_t start, end; - - assert(block_job_driver(job) == &backup_job_driver); - - start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size); - end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size); - wait_for_overlapping_requests(backup_job, start, end); -} - -void backup_cow_request_begin(CowRequest *req, BlockJob *job, - int64_t offset, uint64_t bytes) -{ - BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common); - int64_t start, end; - - assert(block_job_driver(job) == &backup_job_driver); - - start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size); - end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size); - cow_request_begin(req, backup_job, start, end); -} - -void backup_cow_request_end(CowRequest *req) -{ - cow_request_end(req); -} - static void backup_drain(BlockJob *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common); diff --git a/include/block/block_backup.h b/include/block/block_backup.h index 994a3bd2ec..157596c296 100644 --- a/include/block/block_backup.h +++ b/include/block/block_backup.h @@ -20,19 +20,6 @@ #include "block/block_int.h" -typedef struct CowRequest { - int64_t start_byte; - int64_t end_byte; - QLIST_ENTRY(CowRequest) list; - CoQueue wait_queue; /* coroutines blocked on this request */ -} CowRequest; - -void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset, - uint64_t bytes); -void backup_cow_request_begin(CowRequest *req, BlockJob *job, - int64_t offset, uint64_t bytes); -void backup_cow_request_end(CowRequest *req); - void backup_do_checkpoint(BlockJob *job, Error **errp); #endif From 19a44488532eeccb522c6ee6c36f08b6c69809b0 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:32 +0300 Subject: [PATCH 07/42] qcow2: use Z_OK instead of 0 for deflateInit2 return code check Use appropriate macro, corresponding to deflateInit2 spec. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index bc8868c36a..ab1f7388a7 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3738,7 +3738,7 @@ static ssize_t qcow2_compress(void *dest, const void *src, size_t size) memset(&strm, 0, sizeof(strm)); ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -12, 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { + if (ret != Z_OK) { return -2; } From 6994fd78b9a98e9cd32ee45c23edba1919cfbdd0 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:33 +0300 Subject: [PATCH 08/42] qcow2: make more generic interface for qcow2_compress Give explicit size both for source and destination buffers, to make it similar with decompression path and than cleanly reuse parameter structure for decompression threads. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index ab1f7388a7..8606588aca 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3722,14 +3722,15 @@ fail: /* * qcow2_compress() * - * @dest - destination buffer, at least of @size-1 bytes - * @src - source buffer, @size bytes + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes * * Returns: compressed size on success - * -1 if compression is inefficient + * -1 destination buffer is not enough to store compressed data * -2 on any other error */ -static ssize_t qcow2_compress(void *dest, const void *src, size_t size) +static ssize_t qcow2_compress(void *dest, size_t dest_size, + const void *src, size_t src_size) { ssize_t ret; z_stream strm; @@ -3744,14 +3745,14 @@ static ssize_t qcow2_compress(void *dest, const void *src, size_t size) /* strm.next_in is not const in old zlib versions, such as those used on * OpenBSD/NetBSD, so cast the const away */ - strm.avail_in = size; + strm.avail_in = src_size; strm.next_in = (void *) src; - strm.avail_out = size - 1; + strm.avail_out = dest_size; strm.next_out = dest; ret = deflate(&strm, Z_FINISH); if (ret == Z_STREAM_END) { - ret = size - 1 - strm.avail_out; + ret = dest_size - strm.avail_out; } else { ret = (ret == Z_OK ? -1 : -2); } @@ -3765,8 +3766,9 @@ static ssize_t qcow2_compress(void *dest, const void *src, size_t size) typedef struct Qcow2CompressData { void *dest; + size_t dest_size; const void *src; - size_t size; + size_t src_size; ssize_t ret; } Qcow2CompressData; @@ -3774,7 +3776,8 @@ static int qcow2_compress_pool_func(void *opaque) { Qcow2CompressData *data = opaque; - data->ret = qcow2_compress(data->dest, data->src, data->size); + data->ret = qcow2_compress(data->dest, data->dest_size, + data->src, data->src_size); return 0; } @@ -3786,15 +3789,17 @@ static void qcow2_compress_complete(void *opaque, int ret) /* See qcow2_compress definition for parameters description */ static ssize_t qcow2_co_compress(BlockDriverState *bs, - void *dest, const void *src, size_t size) + void *dest, size_t dest_size, + const void *src, size_t src_size) { BDRVQcow2State *s = bs->opaque; BlockAIOCB *acb; ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); Qcow2CompressData arg = { .dest = dest, + .dest_size = dest_size, .src = src, - .size = size, + .src_size = src_size, }; while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { @@ -3861,7 +3866,8 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, out_buf = g_malloc(s->cluster_size); - out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size); + out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1, + buf, s->cluster_size); if (out_len == -2) { ret = -EINVAL; goto fail; From f4b3e2a960ed571397292150539455a2dd981338 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:34 +0300 Subject: [PATCH 09/42] qcow2: move decompression from qcow2-cluster.c to qcow2.c Compression is done in threads in qcow2.c. We want to do decompression in the same way, so, firstly, move it to the same file. The only change is braces around if-body in decompress_buffer, to satisfy checkpatch. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 70 ------------------------------------------ block/qcow2.c | 71 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 70 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index d37fe08b3d..e2737429f5 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1571,76 +1571,6 @@ again: return 0; } -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) -{ - z_stream strm1, *strm = &strm1; - int ret, out_len; - - memset(strm, 0, sizeof(*strm)); - - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); - if (ret != Z_OK) - return -1; - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; - } - inflateEnd(strm); - return 0; -} - -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) -{ - BDRVQcow2State *s = bs->opaque; - int ret, csize, nb_csectors, sector_offset; - uint64_t coffset; - - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; - sector_offset = coffset & 511; - csize = nb_csectors * 512 - sector_offset; - - /* Allocate buffers on first decompress operation, most images are - * uncompressed and the memory overhead can be avoided. The buffers - * are freed in .bdrv_close(). - */ - if (!s->cluster_data) { - /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_try_blockalign(bs->file->bs, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512); - if (!s->cluster_data) { - return -ENOMEM; - } - } - if (!s->cluster_cache) { - s->cluster_cache = g_malloc(s->cluster_size); - } - - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, - nb_csectors); - if (ret < 0) { - return ret; - } - if (decompress_buffer(s->cluster_cache, s->cluster_size, - s->cluster_data + sector_offset, csize) < 0) { - return -EIO; - } - s->cluster_cache_offset = coffset; - } - return 0; -} - /* * This discards as many clusters of nb_clusters as possible at once (i.e. * all clusters in the same L2 slice) and returns the number of discarded diff --git a/block/qcow2.c b/block/qcow2.c index 8606588aca..a3b9fe391c 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3762,6 +3762,34 @@ static ssize_t qcow2_compress(void *dest, size_t dest_size, return ret; } +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) { + return -1; + } + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + #define MAX_COMPRESS_THREADS 4 typedef struct Qcow2CompressData { @@ -3915,6 +3943,49 @@ fail: return ret; } +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcow2State *s = bs->opaque; + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + + /* Allocate buffers on first decompress operation, most images are + * uncompressed and the memory overhead can be avoided. The buffers + * are freed in .bdrv_close(). + */ + if (!s->cluster_data) { + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_try_blockalign(bs->file->bs, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512); + if (!s->cluster_data) { + return -ENOMEM; + } + } + if (!s->cluster_cache) { + s->cluster_cache = g_malloc(s->cluster_size); + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, + nb_csectors); + if (ret < 0) { + return ret; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -EIO; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + static int make_completely_empty(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; From 341926ab83e2b4d5af1f8c7633e8b6d72d318458 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:35 +0300 Subject: [PATCH 10/42] qcow2: refactor decompress_buffer - make it look more like a pair of qcow2_compress - rename the function and its parameters - drop extra out_len variable, check filling of output buffer by strm structure itself - fix code style - add some documentation Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2.c | 56 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index a3b9fe391c..b60c01582c 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3762,32 +3762,46 @@ static ssize_t qcow2_compress(void *dest, size_t dest_size, return ret; } -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) +/* + * qcow2_decompress() + * + * Decompress some data (not more than @src_size bytes) to produce exactly + * @dest_size bytes. + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: 0 on success + * -1 on fail + */ +static ssize_t qcow2_decompress(void *dest, size_t dest_size, + const void *src, size_t src_size) { - z_stream strm1, *strm = &strm1; - int ret, out_len; + int ret = 0; + z_stream strm; - memset(strm, 0, sizeof(*strm)); + memset(&strm, 0, sizeof(strm)); + strm.avail_in = src_size; + strm.next_in = (void *) src; + strm.avail_out = dest_size; + strm.next_out = dest; - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); + ret = inflateInit2(&strm, -12); if (ret != Z_OK) { return -1; } - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; + + ret = inflate(&strm, Z_FINISH); + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || strm.avail_out != 0) { + /* We approve Z_BUF_ERROR because we need @dest buffer to be filled, but + * @src buffer may be processed partly (because in qcow2 we know size of + * compressed data with precision of one sector) */ + ret = -1; } - inflateEnd(strm); - return 0; + + inflateEnd(&strm); + + return ret; } #define MAX_COMPRESS_THREADS 4 @@ -3977,8 +3991,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) if (ret < 0) { return ret; } - if (decompress_buffer(s->cluster_cache, s->cluster_size, - s->cluster_data + sector_offset, csize) < 0) { + if (qcow2_decompress(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { return -EIO; } s->cluster_cache_offset = coffset; From c068a1cd5203cc406a648a641e773ba906cf4aaa Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:36 +0300 Subject: [PATCH 11/42] qcow2: use byte-based read in qcow2_decompress_cluster We are gradually moving away from sector-based interfaces, towards byte-based. Get rid of it here too. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/qcow2.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index b60c01582c..014aca6492 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3957,17 +3957,19 @@ fail: return ret; } -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +int coroutine_fn +qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) { BDRVQcow2State *s = bs->opaque; - int ret, csize, nb_csectors, sector_offset; + int ret, csize, nb_csectors; uint64_t coffset; + struct iovec iov; + QEMUIOVector local_qiov; coffset = cluster_offset & s->cluster_offset_mask; if (s->cluster_cache_offset != coffset) { nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; - sector_offset = coffset & 511; - csize = nb_csectors * 512 - sector_offset; + csize = nb_csectors * 512 - (coffset & 511); /* Allocate buffers on first decompress operation, most images are * uncompressed and the memory overhead can be avoided. The buffers @@ -3985,14 +3987,17 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) s->cluster_cache = g_malloc(s->cluster_size); } + iov.iov_base = s->cluster_data; + iov.iov_len = csize; + qemu_iovec_init_external(&local_qiov, &iov, 1); + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, - nb_csectors); + ret = bdrv_co_preadv(bs->file, coffset, csize, &local_qiov, 0); if (ret < 0) { return ret; } if (qcow2_decompress(s->cluster_cache, s->cluster_size, - s->cluster_data + sector_offset, csize) < 0) { + s->cluster_data, csize) < 0) { return -EIO; } s->cluster_cache_offset = coffset; From c3c10f7295421f3b6638716d59cac9531c34adc8 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:37 +0300 Subject: [PATCH 12/42] qcow2: aio support for compressed cluster read Allocate buffers locally and release qcow2 lock. Than, reads inside qcow2_co_preadv_compressed may be done in parallel, however all decompression is still done synchronously. Let's improve it in the following commit. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2.c | 100 +++++++++++++++++++++++++------------------------- block/qcow2.h | 4 -- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 014aca6492..5d8bac6b62 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -74,6 +74,13 @@ typedef struct { #define QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77 #define QCOW2_EXT_MAGIC_BITMAPS 0x23852875 +static int coroutine_fn +qcow2_co_preadv_compressed(BlockDriverState *bs, + uint64_t file_cluster_offset, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov); + static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) { const QCowHeader *cow_header = (const void *)buf; @@ -1414,7 +1421,6 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, goto fail; } - s->cluster_cache_offset = -1; s->flags = flags; ret = qcow2_refcount_init(bs); @@ -1914,15 +1920,15 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_COMPRESSED: - /* add AIO support for compressed blocks ? */ - ret = qcow2_decompress_cluster(bs, cluster_offset); + qemu_co_mutex_unlock(&s->lock); + ret = qcow2_co_preadv_compressed(bs, cluster_offset, + offset, cur_bytes, + &hd_qiov); + qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } - qemu_iovec_from_buf(&hd_qiov, 0, - s->cluster_cache + offset_in_cluster, - cur_bytes); break; case QCOW2_CLUSTER_NORMAL: @@ -2058,8 +2064,6 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, qemu_iovec_init(&hd_qiov, qiov->niov); - s->cluster_cache_offset = -1; /* disable compressed cache */ - qemu_co_mutex_lock(&s->lock); while (bytes != 0) { @@ -2223,8 +2227,6 @@ static void qcow2_close(BlockDriverState *bs) g_free(s->image_backing_file); g_free(s->image_backing_format); - g_free(s->cluster_cache); - qemu_vfree(s->cluster_data); qcow2_refcount_close(bs); qcow2_free_snapshots(bs); } @@ -3401,7 +3403,6 @@ qcow2_co_copy_range_to(BlockDriverState *bs, QCowL2Meta *l2meta = NULL; assert(!bs->encrypted); - s->cluster_cache_offset = -1; /* disable compressed cache */ qemu_co_mutex_lock(&s->lock); @@ -3957,52 +3958,53 @@ fail: return ret; } -int coroutine_fn -qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +static int coroutine_fn +qcow2_co_preadv_compressed(BlockDriverState *bs, + uint64_t file_cluster_offset, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) { BDRVQcow2State *s = bs->opaque; - int ret, csize, nb_csectors; + int ret = 0, csize, nb_csectors; uint64_t coffset; + uint8_t *buf, *out_buf; struct iovec iov; QEMUIOVector local_qiov; + int offset_in_cluster = offset_into_cluster(s, offset); - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; - csize = nb_csectors * 512 - (coffset & 511); + coffset = file_cluster_offset & s->cluster_offset_mask; + nb_csectors = ((file_cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + csize = nb_csectors * 512 - (coffset & 511); - /* Allocate buffers on first decompress operation, most images are - * uncompressed and the memory overhead can be avoided. The buffers - * are freed in .bdrv_close(). - */ - if (!s->cluster_data) { - /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_try_blockalign(bs->file->bs, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512); - if (!s->cluster_data) { - return -ENOMEM; - } - } - if (!s->cluster_cache) { - s->cluster_cache = g_malloc(s->cluster_size); - } - - iov.iov_base = s->cluster_data; - iov.iov_len = csize; - qemu_iovec_init_external(&local_qiov, &iov, 1); - - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_co_preadv(bs->file, coffset, csize, &local_qiov, 0); - if (ret < 0) { - return ret; - } - if (qcow2_decompress(s->cluster_cache, s->cluster_size, - s->cluster_data, csize) < 0) { - return -EIO; - } - s->cluster_cache_offset = coffset; + buf = g_try_malloc(csize); + if (!buf) { + return -ENOMEM; } - return 0; + iov.iov_base = buf; + iov.iov_len = csize; + qemu_iovec_init_external(&local_qiov, &iov, 1); + + out_buf = qemu_blockalign(bs, s->cluster_size); + + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_co_preadv(bs->file, coffset, csize, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + + if (qcow2_decompress(out_buf, s->cluster_size, buf, csize) < 0) { + ret = -EIO; + goto fail; + } + + qemu_iovec_from_buf(qiov, 0, out_buf + offset_in_cluster, bytes); + +fail: + qemu_vfree(out_buf); + g_free(buf); + + return ret; } static int make_completely_empty(BlockDriverState *bs) diff --git a/block/qcow2.h b/block/qcow2.h index 8662b68575..a98d24500b 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -278,9 +278,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; - uint8_t *cluster_cache; - uint8_t *cluster_data; - uint64_t cluster_cache_offset; QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; @@ -616,7 +613,6 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size); int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size); int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, uint8_t *buf, int nb_sectors, bool enc, Error **errp); From e23c9d7a1c827ac41cc4719d4c4800139b232857 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 1 Nov 2018 21:27:38 +0300 Subject: [PATCH 13/42] qcow2: do decompression in threads Do decompression in threads, like it is already done for compression. This improves asynchronous compressed reads performance. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5d8bac6b62..4897abae5e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3807,20 +3807,24 @@ static ssize_t qcow2_decompress(void *dest, size_t dest_size, #define MAX_COMPRESS_THREADS 4 +typedef ssize_t (*Qcow2CompressFunc)(void *dest, size_t dest_size, + const void *src, size_t src_size); typedef struct Qcow2CompressData { void *dest; size_t dest_size; const void *src; size_t src_size; ssize_t ret; + + Qcow2CompressFunc func; } Qcow2CompressData; static int qcow2_compress_pool_func(void *opaque) { Qcow2CompressData *data = opaque; - data->ret = qcow2_compress(data->dest, data->dest_size, - data->src, data->src_size); + data->ret = data->func(data->dest, data->dest_size, + data->src, data->src_size); return 0; } @@ -3830,10 +3834,9 @@ static void qcow2_compress_complete(void *opaque, int ret) qemu_coroutine_enter(opaque); } -/* See qcow2_compress definition for parameters description */ -static ssize_t qcow2_co_compress(BlockDriverState *bs, - void *dest, size_t dest_size, - const void *src, size_t src_size) +static ssize_t coroutine_fn +qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size, Qcow2CompressFunc func) { BDRVQcow2State *s = bs->opaque; BlockAIOCB *acb; @@ -3843,6 +3846,7 @@ static ssize_t qcow2_co_compress(BlockDriverState *bs, .dest_size = dest_size, .src = src, .src_size = src_size, + .func = func, }; while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) { @@ -3865,6 +3869,22 @@ static ssize_t qcow2_co_compress(BlockDriverState *bs, return arg.ret; } +static ssize_t coroutine_fn +qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, + qcow2_compress); +} + +static ssize_t coroutine_fn +qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, + const void *src, size_t src_size) +{ + return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, + qcow2_decompress); +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static coroutine_fn int @@ -3993,7 +4013,7 @@ qcow2_co_preadv_compressed(BlockDriverState *bs, goto fail; } - if (qcow2_decompress(out_buf, s->cluster_size, buf, csize) < 0) { + if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) { ret = -EIO; goto fail; } From d57c44d00f0daddd6d31b326ef844bf2facf6d8a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 16:21:14 +0100 Subject: [PATCH 14/42] file-posix: Reorganise RawPosixAIOData RawPosixAIOData contains a lot of fields for several separate operations that are to be processed in a worker thread and that need different parameters. The struct is currently rather unorganised, with unions that cover some, but not all operations, and even one #define for field names instead of a union. Clean this up to have some common fields and a single union. As a side effect, on x86_64 the struct shrinks from 72 to 48 bytes. Signed-off-by: Kevin Wolf --- block/file-posix.c | 91 +++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 07bbdab953..d0102a7ec0 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -182,25 +182,29 @@ static int64_t raw_getlength(BlockDriverState *bs); typedef struct RawPosixAIOData { BlockDriverState *bs; - int aio_fildes; - union { - struct iovec *aio_iov; - void *aio_ioctl_buf; - }; - int aio_niov; - uint64_t aio_nbytes; -#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ - off_t aio_offset; int aio_type; + int aio_fildes; + + off_t aio_offset; + uint64_t aio_nbytes; + union { + struct { + struct iovec *iov; + int niov; + } io; + struct { + uint64_t cmd; + void *buf; + } ioctl; struct { int aio_fd2; off_t aio_offset2; - }; + } copy_range; struct { PreallocMode prealloc; Error **errp; - }; + } truncate; }; } RawPosixAIOData; @@ -1152,7 +1156,7 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { int ret; - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf); if (ret == -1) { return -errno; } @@ -1233,13 +1237,13 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) do { if (aiocb->aio_type & QEMU_AIO_WRITE) len = qemu_pwritev(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, + aiocb->io.iov, + aiocb->io.niov, aiocb->aio_offset); else len = qemu_preadv(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, + aiocb->io.iov, + aiocb->io.niov, aiocb->aio_offset); } while (len == -1 && errno == EINTR); @@ -1305,8 +1309,8 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) * If there is just a single buffer, and it is properly aligned * we can just use plain pread/pwrite without any problems. */ - if (aiocb->aio_niov == 1) { - return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + if (aiocb->io.niov == 1) { + return handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base); } /* * We have more than one iovec, and all are properly aligned. @@ -1343,9 +1347,9 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) char *p = buf; int i; - for (i = 0; i < aiocb->aio_niov; ++i) { - memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); - p += aiocb->aio_iov[i].iov_len; + for (i = 0; i < aiocb->io.niov; ++i) { + memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len); + p += aiocb->io.iov[i].iov_len; } assert(p - buf == aiocb->aio_nbytes); } @@ -1356,12 +1360,12 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) size_t count = aiocb->aio_nbytes, copy; int i; - for (i = 0; i < aiocb->aio_niov && count; ++i) { + for (i = 0; i < aiocb->io.niov && count; ++i) { copy = count; - if (copy > aiocb->aio_iov[i].iov_len) { - copy = aiocb->aio_iov[i].iov_len; + if (copy > aiocb->io.iov[i].iov_len) { + copy = aiocb->io.iov[i].iov_len; } - memcpy(aiocb->aio_iov[i].iov_base, p, copy); + memcpy(aiocb->io.iov[i].iov_base, p, copy); assert(count >= copy); p += copy; count -= copy; @@ -1572,14 +1576,15 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) { uint64_t bytes = aiocb->aio_nbytes; off_t in_off = aiocb->aio_offset; - off_t out_off = aiocb->aio_offset2; + off_t out_off = aiocb->copy_range.aio_offset2; while (bytes) { ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, - aiocb->aio_fd2, &out_off, + aiocb->copy_range.aio_fd2, &out_off, bytes, 0); trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off, - aiocb->aio_fd2, out_off, bytes, 0, ret); + aiocb->copy_range.aio_fd2, out_off, bytes, + 0, ret); if (ret == 0) { /* No progress (e.g. when beyond EOF), let the caller fall back to * buffer I/O. */ @@ -1648,7 +1653,8 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) struct stat st; int fd = aiocb->aio_fildes; int64_t offset = aiocb->aio_offset; - Error **errp = aiocb->errp; + PreallocMode prealloc = aiocb->truncate.prealloc; + Error **errp = aiocb->truncate.errp; if (fstat(fd, &st) < 0) { result = -errno; @@ -1657,12 +1663,12 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) } current_length = st.st_size; - if (current_length > offset && aiocb->prealloc != PREALLOC_MODE_OFF) { + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { error_setg(errp, "Cannot use preallocation for shrinking files"); return -ENOTSUP; } - switch (aiocb->prealloc) { + switch (prealloc) { #ifdef CONFIG_POSIX_FALLOCATE case PREALLOC_MODE_FALLOC: /* @@ -1743,7 +1749,7 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) default: result = -ENOTSUP; error_setg(errp, "Unsupported preallocation mode: %s", - PreallocMode_str(aiocb->prealloc)); + PreallocMode_str(prealloc)); return result; } @@ -1768,7 +1774,7 @@ static int aio_worker(void *arg) case QEMU_AIO_READ: ret = handle_aiocb_rw(aiocb); if (ret >= 0 && ret < aiocb->aio_nbytes) { - iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, + iov_memset(aiocb->io.iov, aiocb->io.niov, ret, 0, aiocb->aio_nbytes - ret); ret = aiocb->aio_nbytes; @@ -1829,16 +1835,17 @@ static int paio_submit_co_full(BlockDriverState *bs, int fd, acb->bs = bs; acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_fd2 = fd2; - acb->aio_offset2 = offset2; acb->aio_nbytes = bytes; acb->aio_offset = offset; if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; + acb->io.iov = qiov->iov; + acb->io.niov = qiov->niov; assert(qiov->size == bytes); + } else { + acb->copy_range.aio_fd2 = fd2; + acb->copy_range.aio_offset2 = offset2; } trace_file_paio_submit_co(offset, bytes, type); @@ -1976,8 +1983,10 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, .aio_offset = offset, - .prealloc = prealloc, - .errp = errp, + .truncate = { + .prealloc = prealloc, + .errp = errp, + }, }; /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ @@ -3089,8 +3098,8 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, acb->aio_type = QEMU_AIO_IOCTL; acb->aio_fildes = s->fd; acb->aio_offset = 0; - acb->aio_ioctl_buf = buf; - acb->aio_ioctl_cmd = req; + acb->ioctl.buf = buf; + acb->ioctl.cmd = req; pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } From 5d5de250056b0972cde2e88133db702960a32b72 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 26 Oct 2018 16:53:47 +0100 Subject: [PATCH 15/42] file-posix: Factor out raw_thread_pool_submit() Getting the thread pool of the AioContext of a block node and scheduling some work in it is an operation that is already done twice, and we'll get more instances. Factor it out into a separate function. Signed-off-by: Kevin Wolf --- block/file-posix.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index d0102a7ec0..27be94cfe5 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1824,13 +1824,20 @@ static int aio_worker(void *arg) return ret; } +static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs, + ThreadPoolFunc func, void *arg) +{ + /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ + ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_co(pool, func, arg); +} + static int paio_submit_co_full(BlockDriverState *bs, int fd, int64_t offset, int fd2, int64_t offset2, QEMUIOVector *qiov, int bytes, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; acb->bs = bs; acb->aio_type = type; @@ -1849,8 +1856,7 @@ static int paio_submit_co_full(BlockDriverState *bs, int fd, } trace_file_paio_submit_co(offset, bytes, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); + return raw_thread_pool_submit(bs, aio_worker, acb); } static inline int paio_submit_co(BlockDriverState *bs, int fd, @@ -1976,7 +1982,6 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, PreallocMode prealloc, Error **errp) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; *acb = (RawPosixAIOData) { .bs = bs, @@ -1989,9 +1994,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, }, }; - /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); + return raw_thread_pool_submit(bs, aio_worker, acb); } static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, From 29cb4c01e7b38589fa4fe3f9ea82f69dc2df3051 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 16/42] file-posix: Avoid aio_worker() for QEMU_AIO_TRUNCATE aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 27be94cfe5..7c5121efc9 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1645,8 +1645,9 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) return ret; } -static int handle_aiocb_truncate(RawPosixAIOData *aiocb) +static int handle_aiocb_truncate(void *opaque) { + RawPosixAIOData *aiocb = opaque; int result = 0; int64_t current_length = 0; char *buf = NULL; @@ -1812,8 +1813,7 @@ static int aio_worker(void *arg) ret = handle_aiocb_copy_range(aiocb); break; case QEMU_AIO_TRUNCATE: - ret = handle_aiocb_truncate(aiocb); - break; + g_assert_not_reached(); default: error_report("invalid aio request (0x%x)", aiocb->aio_type); ret = -EINVAL; @@ -1981,9 +1981,9 @@ static int coroutine_fn raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, PreallocMode prealloc, Error **errp) { - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); + RawPosixAIOData acb; - *acb = (RawPosixAIOData) { + acb = (RawPosixAIOData) { .bs = bs, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, @@ -1994,7 +1994,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, }, }; - return raw_thread_pool_submit(bs, aio_worker, acb); + return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb); } static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, From 58a209c43747c311ceb6f6a6f3e5904b354efce2 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 17/42] file-posix: Avoid aio_worker() for QEMU_AIO_COPY_RANGE aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 7c5121efc9..e502ba7e15 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1572,8 +1572,9 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif -static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) +static int handle_aiocb_copy_range(void *opaque) { + RawPosixAIOData *aiocb = opaque; uint64_t bytes = aiocb->aio_nbytes; off_t in_off = aiocb->aio_offset; off_t out_off = aiocb->copy_range.aio_offset2; @@ -1810,8 +1811,6 @@ static int aio_worker(void *arg) ret = handle_aiocb_write_zeroes_unmap(aiocb); break; case QEMU_AIO_COPY_RANGE: - ret = handle_aiocb_copy_range(aiocb); - break; case QEMU_AIO_TRUNCATE: g_assert_not_reached(); default: @@ -2714,6 +2713,7 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { + RawPosixAIOData acb; BDRVRawState *s = bs->opaque; BDRVRawState *src_s; @@ -2726,8 +2726,20 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) { return -EIO; } - return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset, - NULL, bytes, QEMU_AIO_COPY_RANGE); + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_type = QEMU_AIO_COPY_RANGE, + .aio_fildes = src_s->fd, + .aio_offset = src_offset, + .aio_nbytes = bytes, + .copy_range = { + .aio_fd2 = s->fd, + .aio_offset2 = dst_offset, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb); } BlockDriver bdrv_file = { From 7154d8ae66c75c97b08c8f1c80dd6f46f0dbffca Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 18/42] file-posix: Avoid aio_worker() for QEMU_AIO_WRITE_ZEROES aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 57 +++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index e502ba7e15..16f2b202c6 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1464,8 +1464,9 @@ static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) return ret; } -static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +static int handle_aiocb_write_zeroes(void *opaque) { + RawPosixAIOData *aiocb = opaque; #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) BDRVRawState *s = aiocb->bs->opaque; #endif @@ -1529,8 +1530,9 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) return -ENOTSUP; } -static ssize_t handle_aiocb_write_zeroes_unmap(RawPosixAIOData *aiocb) +static int handle_aiocb_write_zeroes_unmap(void *opaque) { + RawPosixAIOData *aiocb = opaque; BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; int ret; @@ -1805,11 +1807,7 @@ static int aio_worker(void *arg) ret = handle_aiocb_discard(aiocb); break; case QEMU_AIO_WRITE_ZEROES: - ret = handle_aiocb_write_zeroes(aiocb); - break; case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: - ret = handle_aiocb_write_zeroes_unmap(aiocb); - break; case QEMU_AIO_COPY_RANGE: case QEMU_AIO_TRUNCATE: g_assert_not_reached(); @@ -2631,18 +2629,41 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD); } +static int coroutine_fn +raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, + BdrvRequestFlags flags, bool blkdev) +{ + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + ThreadPoolFunc *handler; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_WRITE_ZEROES, + .aio_offset = offset, + .aio_nbytes = bytes, + }; + + if (blkdev) { + acb.aio_type |= QEMU_AIO_BLKDEV; + } + + if (flags & BDRV_REQ_MAY_UNMAP) { + acb.aio_type |= QEMU_AIO_DISCARD; + handler = handle_aiocb_write_zeroes_unmap; + } else { + handler = handle_aiocb_write_zeroes; + } + + return raw_thread_pool_submit(bs, handler, &acb); +} + static int coroutine_fn raw_co_pwrite_zeroes( BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; - int operation = QEMU_AIO_WRITE_ZEROES; - - if (flags & BDRV_REQ_MAY_UNMAP) { - operation |= QEMU_AIO_DISCARD; - } - - return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false); } static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -3147,8 +3168,6 @@ hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; - int operation = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV; int rc; rc = fd_open(bs); @@ -3156,11 +3175,7 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, return rc; } - if (flags & BDRV_REQ_MAY_UNMAP) { - operation |= QEMU_AIO_DISCARD; - } - - return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true); } static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, From 46ee0f462bfa1e374fa0f5df5834b061a632af6d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 19/42] file-posix: Avoid aio_worker() for QEMU_AIO_DISCARD aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 16f2b202c6..88887108a7 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1608,8 +1608,9 @@ static int handle_aiocb_copy_range(void *opaque) return 0; } -static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +static int handle_aiocb_discard(void *opaque) { + RawPosixAIOData *aiocb = opaque; int ret = -EOPNOTSUPP; BDRVRawState *s = aiocb->bs->opaque; @@ -1804,8 +1805,6 @@ static int aio_worker(void *arg) ret = handle_aiocb_ioctl(aiocb); break; case QEMU_AIO_DISCARD: - ret = handle_aiocb_discard(aiocb); - break; case QEMU_AIO_WRITE_ZEROES: case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: case QEMU_AIO_COPY_RANGE: @@ -2622,11 +2621,30 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, } static coroutine_fn int -raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; - return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD); + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_DISCARD, + .aio_offset = offset, + .aio_nbytes = bytes, + }; + + if (blkdev) { + acb.aio_type |= QEMU_AIO_BLKDEV; + } + + return raw_thread_pool_submit(bs, handle_aiocb_discard, &acb); +} + +static coroutine_fn int +raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +{ + return raw_do_pdiscard(bs, offset, bytes, false); } static int coroutine_fn @@ -3154,15 +3172,13 @@ static int fd_open(BlockDriverState *bs) static coroutine_fn int hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - BDRVRawState *s = bs->opaque; int ret; ret = fd_open(bs); if (ret < 0) { return ret; } - return paio_submit_co(bs, s->fd, offset, NULL, bytes, - QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV); + return raw_do_pdiscard(bs, offset, bytes, true); } static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, From 06dc9bd57182eb1a09cd0f7b1cb145937ed4e618 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 20/42] file-posix: Avoid aio_worker() for QEMU_AIO_FLUSH aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 88887108a7..4a94597b80 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1164,8 +1164,9 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) return 0; } -static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) +static int handle_aiocb_flush(void *opaque) { + RawPosixAIOData *aiocb = opaque; BDRVRawState *s = aiocb->bs->opaque; int ret; @@ -1798,12 +1799,10 @@ static int aio_worker(void *arg) ret = -EINVAL; } break; - case QEMU_AIO_FLUSH: - ret = handle_aiocb_flush(aiocb); - break; case QEMU_AIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); break; + case QEMU_AIO_FLUSH: case QEMU_AIO_DISCARD: case QEMU_AIO_WRITE_ZEROES: case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: @@ -1931,6 +1930,7 @@ static void raw_aio_unplug(BlockDriverState *bs) static int raw_co_flush_to_disk(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; int ret; ret = fd_open(bs); @@ -1938,7 +1938,13 @@ static int raw_co_flush_to_disk(BlockDriverState *bs) return ret; } - return paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH); + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_FLUSH, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb); } static void raw_aio_attach_aio_context(BlockDriverState *bs, From 54c7ca1b8135e728b70320418d6424ad78cc4629 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Oct 2018 09:43:17 +0100 Subject: [PATCH 21/42] file-posix: Move read/write operation logic out of aio_worker() aio_worker() for reads and writes isn't boring enough yet. It still does some postprocessing for handling short reads and turning the result into the right return value. However, there is no reason why handle_aiocb_rw() couldn't do the same, and even without duplicating code between the read and write path. So move the code there. Signed-off-by: Kevin Wolf --- block/file-posix.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4a94597b80..877c0700e1 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1311,7 +1311,8 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) * we can just use plain pread/pwrite without any problems. */ if (aiocb->io.niov == 1) { - return handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base); + nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base); + goto out; } /* * We have more than one iovec, and all are properly aligned. @@ -1323,7 +1324,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) nbytes = handle_aiocb_rw_vector(aiocb); if (nbytes == aiocb->aio_nbytes || (nbytes < 0 && nbytes != -ENOSYS)) { - return nbytes; + goto out; } preadv_present = false; } @@ -1341,7 +1342,8 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) */ buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); if (buf == NULL) { - return -ENOMEM; + nbytes = -ENOMEM; + goto out; } if (aiocb->aio_type & QEMU_AIO_WRITE) { @@ -1375,7 +1377,21 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) } qemu_vfree(buf); - return nbytes; +out: + if (nbytes == aiocb->aio_nbytes) { + return 0; + } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { + if (aiocb->aio_type & QEMU_AIO_WRITE) { + return -EINVAL; + } else { + iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes, + 0, aiocb->aio_nbytes - nbytes); + return 0; + } + } else { + assert(nbytes < 0); + return nbytes; + } } #ifdef CONFIG_XFS @@ -1779,25 +1795,9 @@ static int aio_worker(void *arg) switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { case QEMU_AIO_READ: ret = handle_aiocb_rw(aiocb); - if (ret >= 0 && ret < aiocb->aio_nbytes) { - iov_memset(aiocb->io.iov, aiocb->io.niov, ret, - 0, aiocb->aio_nbytes - ret); - - ret = aiocb->aio_nbytes; - } - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; - } break; case QEMU_AIO_WRITE: ret = handle_aiocb_rw(aiocb); - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; - } break; case QEMU_AIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); From 999e6b69ce5adc0c68cdd90211569f61f006858b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Oct 2018 14:18:58 +0100 Subject: [PATCH 22/42] file-posix: Avoid aio_worker() for QEMU_AIO_READ/WRITE aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. Signed-off-by: Kevin Wolf --- block/file-posix.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 877c0700e1..0f64c83639 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1300,8 +1300,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) return offset; } -static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) +static int handle_aiocb_rw(void *opaque) { + RawPosixAIOData *aiocb = opaque; ssize_t nbytes; char *buf; @@ -1793,15 +1794,11 @@ static int aio_worker(void *arg) ssize_t ret = 0; switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_READ: - ret = handle_aiocb_rw(aiocb); - break; - case QEMU_AIO_WRITE: - ret = handle_aiocb_rw(aiocb); - break; case QEMU_AIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); break; + case QEMU_AIO_READ: + case QEMU_AIO_WRITE: case QEMU_AIO_FLUSH: case QEMU_AIO_DISCARD: case QEMU_AIO_WRITE_ZEROES: @@ -1865,6 +1862,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int type) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; if (fd_open(bs) < 0) return -EIO; @@ -1887,7 +1885,20 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, } } - return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = type, + .aio_offset = offset, + .aio_nbytes = bytes, + .io = { + .iov = qiov->iov, + .niov = qiov->niov, + }, + }; + + assert(qiov->size == bytes); + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); } static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, From c9db2b6489f7d04b5fd3288565cb163476a04dc1 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Oct 2018 10:26:18 +0100 Subject: [PATCH 23/42] file-posix: Remove paio_submit_co() The function is not used any more, remove it. Signed-off-by: Kevin Wolf --- block/file-posix.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 0f64c83639..c8a085a911 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1824,40 +1824,6 @@ static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs, return thread_pool_submit_co(pool, func, arg); } -static int paio_submit_co_full(BlockDriverState *bs, int fd, - int64_t offset, int fd2, int64_t offset2, - QEMUIOVector *qiov, - int bytes, int type) -{ - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - - acb->bs = bs; - acb->aio_type = type; - acb->aio_fildes = fd; - - acb->aio_nbytes = bytes; - acb->aio_offset = offset; - - if (qiov) { - acb->io.iov = qiov->iov; - acb->io.niov = qiov->niov; - assert(qiov->size == bytes); - } else { - acb->copy_range.aio_fd2 = fd2; - acb->copy_range.aio_offset2 = offset2; - } - - trace_file_paio_submit_co(offset, bytes, type); - return raw_thread_pool_submit(bs, aio_worker, acb); -} - -static inline int paio_submit_co(BlockDriverState *bs, int fd, - int64_t offset, QEMUIOVector *qiov, - int bytes, int type) -{ - return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type); -} - static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int type) { From 2f3a7ab39bec4ba8022dc4d42ea641165b004e3e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Oct 2018 11:25:18 +0100 Subject: [PATCH 24/42] file-posix: Switch to .bdrv_co_ioctl No real reason to keep using the callback based mechanism here when the rest of the file-posix driver is coroutine based. Changing it brings ioctls more in line with how other request types work. Signed-off-by: Kevin Wolf --- block/file-posix.c | 21 +++++++++++---------- include/scsi/pr-manager.h | 8 +++----- scsi/pr-manager.c | 21 +++++++++------------ scsi/trace-events | 2 +- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index c8a085a911..9c15bbe429 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -3109,24 +3109,25 @@ hdev_open_Mac_error: } #if defined(__linux__) - -static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { BDRVRawState *s = bs->opaque; RawPosixAIOData *acb; ThreadPool *pool; + int ret; - if (fd_open(bs) < 0) - return NULL; + ret = fd_open(bs); + if (ret < 0) { + return ret; + } if (req == SG_IO && s->pr_mgr) { struct sg_io_hdr *io_hdr = buf; if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), - s->fd, io_hdr, cb, opaque); + s->fd, io_hdr); } } @@ -3138,7 +3139,7 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, acb->ioctl.buf = buf; acb->ioctl.cmd = req; pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); + return thread_pool_submit_co(pool, aio_worker, acb); } #endif /* linux */ @@ -3279,7 +3280,7 @@ static BlockDriver bdrv_host_device = { /* generic scsi device */ #ifdef __linux__ - .bdrv_aio_ioctl = hdev_aio_ioctl, + .bdrv_co_ioctl = hdev_co_ioctl, #endif }; @@ -3401,7 +3402,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_lock_medium = cdrom_lock_medium, /* generic scsi device */ - .bdrv_aio_ioctl = hdev_aio_ioctl, + .bdrv_co_ioctl = hdev_co_ioctl, }; #endif /* __linux__ */ diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h index 50a77b08fc..6ad5fd1ff7 100644 --- a/include/scsi/pr-manager.h +++ b/include/scsi/pr-manager.h @@ -5,6 +5,7 @@ #include "qapi/visitor.h" #include "qom/object_interfaces.h" #include "block/aio.h" +#include "qemu/coroutine.h" #define TYPE_PR_MANAGER "pr-manager" @@ -37,11 +38,8 @@ typedef struct PRManagerClass { } PRManagerClass; bool pr_manager_is_connected(PRManager *pr_mgr); -BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, - AioContext *ctx, int fd, - struct sg_io_hdr *hdr, - BlockCompletionFunc *complete, - void *opaque); +int coroutine_fn pr_manager_execute(PRManager *pr_mgr, AioContext *ctx, int fd, + struct sg_io_hdr *hdr); PRManager *pr_manager_lookup(const char *id, Error **errp); diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c index 2a8f300dde..d9f4e8c3ad 100644 --- a/scsi/pr-manager.c +++ b/scsi/pr-manager.c @@ -48,24 +48,21 @@ static int pr_manager_worker(void *opaque) } -BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, - AioContext *ctx, int fd, - struct sg_io_hdr *hdr, - BlockCompletionFunc *complete, - void *opaque) +int coroutine_fn pr_manager_execute(PRManager *pr_mgr, AioContext *ctx, int fd, + struct sg_io_hdr *hdr) { - PRManagerData *data = g_new(PRManagerData, 1); ThreadPool *pool = aio_get_thread_pool(ctx); + PRManagerData data = { + .pr_mgr = pr_mgr, + .fd = fd, + .hdr = hdr, + }; - trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1], opaque); - data->pr_mgr = pr_mgr; - data->fd = fd; - data->hdr = hdr; + trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1]); /* The matching object_unref is in pr_manager_worker. */ object_ref(OBJECT(pr_mgr)); - return thread_pool_submit_aio(pool, pr_manager_worker, - data, complete, opaque); + return thread_pool_submit_co(pool, pr_manager_worker, &data); } bool pr_manager_is_connected(PRManager *pr_mgr) diff --git a/scsi/trace-events b/scsi/trace-events index 45f5b6e49b..f8a68b11eb 100644 --- a/scsi/trace-events +++ b/scsi/trace-events @@ -1,3 +1,3 @@ # scsi/pr-manager.c -pr_manager_execute(int fd, int cmd, int sa, void *opaque) "fd=%d cmd=0x%02x service action=0x%02x opaque=%p" +pr_manager_execute(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x" pr_manager_run(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x" From 0342567115feaf24bafcb25be903a9d732ac78ca Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Oct 2018 11:30:42 +0100 Subject: [PATCH 25/42] file-posix: Avoid aio_worker() for QEMU_AIO_IOCTL aio_worker() doesn't add anything interesting, it's only a useless indirection. Call the handler function directly instead. As we know that this handler function is only called from coroutine context and the coroutine stays around until the worker thread finishes, we can keep RawPosixAIOData on the stack. This was the last user of aio_worker(), so the function goes away now. Signed-off-by: Kevin Wolf --- block/file-posix.c | 57 ++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 9c15bbe429..d8f0b93752 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1152,8 +1152,10 @@ static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) } #endif -static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) +#if defined(__linux__) +static int handle_aiocb_ioctl(void *opaque) { + RawPosixAIOData *aiocb = opaque; int ret; ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf); @@ -1163,6 +1165,7 @@ static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) return 0; } +#endif /* linux */ static int handle_aiocb_flush(void *opaque) { @@ -1788,34 +1791,6 @@ out: return result; } -static int aio_worker(void *arg) -{ - RawPosixAIOData *aiocb = arg; - ssize_t ret = 0; - - switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_IOCTL: - ret = handle_aiocb_ioctl(aiocb); - break; - case QEMU_AIO_READ: - case QEMU_AIO_WRITE: - case QEMU_AIO_FLUSH: - case QEMU_AIO_DISCARD: - case QEMU_AIO_WRITE_ZEROES: - case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: - case QEMU_AIO_COPY_RANGE: - case QEMU_AIO_TRUNCATE: - g_assert_not_reached(); - default: - error_report("invalid aio request (0x%x)", aiocb->aio_type); - ret = -EINVAL; - break; - } - - g_free(aiocb); - return ret; -} - static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs, ThreadPoolFunc func, void *arg) { @@ -3113,8 +3088,7 @@ static int coroutine_fn hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { BDRVRawState *s = bs->opaque; - RawPosixAIOData *acb; - ThreadPool *pool; + RawPosixAIOData acb; int ret; ret = fd_open(bs); @@ -3131,15 +3105,18 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) } } - acb = g_new(RawPosixAIOData, 1); - acb->bs = bs; - acb->aio_type = QEMU_AIO_IOCTL; - acb->aio_fildes = s->fd; - acb->aio_offset = 0; - acb->ioctl.buf = buf; - acb->ioctl.cmd = req; - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); + acb = (RawPosixAIOData) { + .bs = bs, + .aio_type = QEMU_AIO_IOCTL, + .aio_fildes = s->fd, + .aio_offset = 0, + .ioctl = { + .buf = buf, + .cmd = req, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb); } #endif /* linux */ From 6e1000a863f9dcdb4a6f51cd7cd0782f23120ba5 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:33 +0200 Subject: [PATCH 26/42] block: Add bdrv_reopen_set_read_only() Most callers of bdrv_reopen() only use it to switch a BlockDriverState between read-only and read-write, so this patch adds a new function that does just that. We also want to get rid of the flags parameter in the bdrv_reopen() API, so this function sets the "read-only" option and passes the original flags (which will then be updated in bdrv_reopen_prepare()). Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 17 +++++++++++++++++ include/block/block.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/block.c b/block.c index 811239ca23..7178872560 100644 --- a/block.c +++ b/block.c @@ -3146,6 +3146,23 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) return ret; } +int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + Error **errp) +{ + int ret; + BlockReopenQueue *queue; + QDict *opts = qdict_new(); + + qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only); + + bdrv_subtree_drained_begin(bs); + queue = bdrv_reopen_queue(NULL, bs, opts, bdrv_get_flags(bs)); + ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, errp); + bdrv_subtree_drained_end(bs); + + return ret; +} + static BlockReopenQueueEntry *find_parent_in_reopen_queue(BlockReopenQueue *q, BdrvChild *c) { diff --git a/include/block/block.h b/include/block/block.h index 7f5453b45b..382e6643fc 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -303,6 +303,8 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, QDict *options, int flags); int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp); int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp); +int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + Error **errp); int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, Error **errp); void bdrv_reopen_commit(BDRVReopenState *reopen_state); From e94d3dba6a91e3c0be59b20cc841c9da8a49f7aa Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:34 +0200 Subject: [PATCH 27/42] block: Use bdrv_reopen_set_read_only() in bdrv_backing_update_filename() This patch replaces the bdrv_reopen() calls that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index 7178872560..d7b94794f4 100644 --- a/block.c +++ b/block.c @@ -1079,11 +1079,11 @@ static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base, const char *filename, Error **errp) { BlockDriverState *parent = c->opaque; - int orig_flags = bdrv_get_flags(parent); + bool read_only = bdrv_is_read_only(parent); int ret; - if (!(orig_flags & BDRV_O_RDWR)) { - ret = bdrv_reopen(parent, orig_flags | BDRV_O_RDWR, errp); + if (read_only) { + ret = bdrv_reopen_set_read_only(parent, false, errp); if (ret < 0) { return ret; } @@ -1095,8 +1095,8 @@ static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base, error_setg_errno(errp, -ret, "Could not update backing file link"); } - if (!(orig_flags & BDRV_O_RDWR)) { - bdrv_reopen(parent, orig_flags, NULL); + if (read_only) { + bdrv_reopen_set_read_only(parent, true, NULL); } return ret; From e70cdc57daa5d5a16caf3e681ed2ec3522cd75ad Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:35 +0200 Subject: [PATCH 28/42] block: Use bdrv_reopen_set_read_only() in commit_start/complete() This patch replaces the bdrv_reopen() calls that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/commit.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/commit.c b/block/commit.c index a2da5740b0..a53c2d04b0 100644 --- a/block/commit.c +++ b/block/commit.c @@ -38,7 +38,7 @@ typedef struct CommitBlockJob { BlockBackend *base; BlockDriverState *base_bs; BlockdevOnError on_error; - int base_flags; + bool base_read_only; char *backing_file_str; } CommitBlockJob; @@ -124,8 +124,8 @@ static void commit_clean(Job *job) /* restore base open flags here if appropriate (e.g., change the base back * to r/o). These reopens do not need to be atomic, since we won't abort * even on failure here */ - if (s->base_flags != bdrv_get_flags(s->base_bs)) { - bdrv_reopen(s->base_bs, s->base_flags, NULL); + if (s->base_read_only) { + bdrv_reopen_set_read_only(s->base_bs, true, NULL); } g_free(s->backing_file_str); @@ -264,7 +264,6 @@ void commit_start(const char *job_id, BlockDriverState *bs, const char *filter_node_name, Error **errp) { CommitBlockJob *s; - int orig_base_flags; BlockDriverState *iter; BlockDriverState *commit_top_bs = NULL; Error *local_err = NULL; @@ -283,11 +282,9 @@ void commit_start(const char *job_id, BlockDriverState *bs, } /* convert base to r/w, if necessary */ - orig_base_flags = bdrv_get_flags(base); - if (!(orig_base_flags & BDRV_O_RDWR)) { - bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); + s->base_read_only = bdrv_is_read_only(base); + if (s->base_read_only) { + if (bdrv_reopen_set_read_only(base, false, errp) != 0) { goto fail; } } @@ -363,7 +360,6 @@ void commit_start(const char *job_id, BlockDriverState *bs, goto fail; } - s->base_flags = orig_base_flags; s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; From c742a3643f113fe92cf904dd897fa03c2784d30d Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:36 +0200 Subject: [PATCH 29/42] block: Use bdrv_reopen_set_read_only() in bdrv_commit() This patch replaces the bdrv_reopen() calls that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/commit.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/commit.c b/block/commit.c index a53c2d04b0..53148e610b 100644 --- a/block/commit.c +++ b/block/commit.c @@ -391,7 +391,7 @@ int bdrv_commit(BlockDriverState *bs) BlockDriverState *commit_top_bs = NULL; BlockDriver *drv = bs->drv; int64_t offset, length, backing_length; - int ro, open_flags; + int ro; int64_t n; int ret = 0; uint8_t *buf = NULL; @@ -410,10 +410,9 @@ int bdrv_commit(BlockDriverState *bs) } ro = bs->backing->bs->read_only; - open_flags = bs->backing->bs->open_flags; if (ro) { - if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) { + if (bdrv_reopen_set_read_only(bs->backing->bs, false, NULL)) { return -EACCES; } } @@ -523,7 +522,7 @@ ro_cleanup: if (ro) { /* ignoring error return here */ - bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL); + bdrv_reopen_set_read_only(bs->backing->bs, true, NULL); } return ret; From e7d22f8bc60351c8c16c6130efb09f08ba09d789 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:37 +0200 Subject: [PATCH 30/42] block: Use bdrv_reopen_set_read_only() in stream_start/complete() This patch replaces the bdrv_reopen() calls that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/stream.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/stream.c b/block/stream.c index 81a7ec8ece..7a49ac0992 100644 --- a/block/stream.c +++ b/block/stream.c @@ -34,7 +34,7 @@ typedef struct StreamBlockJob { BlockDriverState *base; BlockdevOnError on_error; char *backing_file_str; - int bs_flags; + bool bs_read_only; } StreamBlockJob; static int coroutine_fn stream_populate(BlockBackend *blk, @@ -89,10 +89,10 @@ static void stream_clean(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); /* Reopen the image back in read-only mode if necessary */ - if (s->bs_flags != bdrv_get_flags(bs)) { + if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); - bdrv_reopen(bs, s->bs_flags, NULL); + bdrv_reopen_set_read_only(bs, true, NULL); } g_free(s->backing_file_str); @@ -226,12 +226,12 @@ void stream_start(const char *job_id, BlockDriverState *bs, { StreamBlockJob *s; BlockDriverState *iter; - int orig_bs_flags; + bool bs_read_only; /* Make sure that the image is opened in read-write mode */ - orig_bs_flags = bdrv_get_flags(bs); - if (!(orig_bs_flags & BDRV_O_RDWR)) { - if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) { + bs_read_only = bdrv_is_read_only(bs); + if (bs_read_only) { + if (bdrv_reopen_set_read_only(bs, false, errp) != 0) { return; } } @@ -261,7 +261,7 @@ void stream_start(const char *job_id, BlockDriverState *bs, s->base = base; s->backing_file_str = g_strdup(backing_file_str); - s->bs_flags = orig_bs_flags; + s->bs_read_only = bs_read_only; s->on_error = on_error; trace_stream_start(bs, base, s); @@ -269,7 +269,7 @@ void stream_start(const char *job_id, BlockDriverState *bs, return; fail: - if (orig_bs_flags != bdrv_get_flags(bs)) { - bdrv_reopen(bs, orig_bs_flags, NULL); + if (bs_read_only) { + bdrv_reopen_set_read_only(bs, true, NULL); } } From 051a60f6a342c83fc489e1533b9f2799fa2b2b9b Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:38 +0200 Subject: [PATCH 31/42] block: Use bdrv_reopen_set_read_only() in qmp_change_backing_file() This patch replaces the bdrv_reopen() calls that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- blockdev.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/blockdev.c b/blockdev.c index 81f95d920b..6fa6969cd0 100644 --- a/blockdev.c +++ b/blockdev.c @@ -4100,7 +4100,6 @@ void qmp_change_backing_file(const char *device, BlockDriverState *image_bs = NULL; Error *local_err = NULL; bool ro; - int open_flags; int ret; bs = qmp_get_root_bs(device, errp); @@ -4142,13 +4141,10 @@ void qmp_change_backing_file(const char *device, } /* if not r/w, reopen to make r/w */ - open_flags = image_bs->open_flags; ro = bdrv_is_read_only(image_bs); if (ro) { - bdrv_reopen(image_bs, open_flags | BDRV_O_RDWR, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (bdrv_reopen_set_read_only(image_bs, false, errp) != 0) { goto out; } } @@ -4164,7 +4160,7 @@ void qmp_change_backing_file(const char *device, } if (ro) { - bdrv_reopen(image_bs, open_flags, &local_err); + bdrv_reopen_set_read_only(image_bs, true, &local_err); error_propagate(errp, local_err); } From 1b57774f79e6fb5e848149e0bf227e0534e9ade5 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:39 +0200 Subject: [PATCH 32/42] block: Use bdrv_reopen_set_read_only() in external_snapshot_commit() This patch replaces the bdrv_reopen() call that set and remove the BDRV_O_RDWR flag with the new bdrv_reopen_set_read_only() function. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- blockdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 6fa6969cd0..e6c8349409 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1703,8 +1703,7 @@ static void external_snapshot_commit(BlkActionState *common) * bdrv_reopen_multiple() across all the entries at once, because we * don't want to abort all of them if one of them fails the reopen */ if (!atomic_read(&state->old_bs->copy_on_read)) { - bdrv_reopen(state->old_bs, state->old_bs->open_flags & ~BDRV_O_RDWR, - NULL); + bdrv_reopen_set_read_only(state->old_bs, true, NULL); } aio_context_release(aio_context); From 1ba793889583cf9b548bf61bd5c8f224ad6c1415 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:40 +0200 Subject: [PATCH 33/42] block: Use bdrv_reopen_set_read_only() in the mirror driver The 'block-commit' QMP command is implemented internally using two different drivers. If the source image is the active layer then the mirror driver is used (commit_active_start()), otherwise the commit driver is used (commit_start()). In both cases the destination image must be put temporarily in read-write mode. This is done correctly in the latter case, but what commit_active_start() does is copy all flags instead. This patch replaces the bdrv_reopen() calls in that function with bdrv_reopen_set_read_only() so that only the read-only status is changed. A similar change is made in mirror_exit(), which is also used by the 'drive-mirror' and 'blockdev-mirror' commands. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/mirror.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 8f52c6215d..628f9e6a0b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -669,9 +669,10 @@ static int mirror_exit_common(Job *job) if (s->should_complete && !abort) { BlockDriverState *to_replace = s->to_replace ?: src; + bool ro = bdrv_is_read_only(to_replace); - if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) { - bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL); + if (ro != bdrv_is_read_only(target_bs)) { + bdrv_reopen_set_read_only(target_bs, ro, NULL); } /* The mirror job has no requests in flight any more, but we need to @@ -1689,13 +1690,15 @@ void commit_active_start(const char *job_id, BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque, bool auto_complete, Error **errp) { - int orig_base_flags; + bool base_read_only; Error *local_err = NULL; - orig_base_flags = bdrv_get_flags(base); + base_read_only = bdrv_is_read_only(base); - if (bdrv_reopen(base, bs->open_flags, errp)) { - return; + if (base_read_only) { + if (bdrv_reopen_set_read_only(base, false, errp) < 0) { + return; + } } mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0, @@ -1714,6 +1717,8 @@ void commit_active_start(const char *job_id, BlockDriverState *bs, error_restore_flags: /* ignore error and errp for bdrv_reopen, because we want to propagate * the original error */ - bdrv_reopen(base, orig_base_flags, NULL); + if (base_read_only) { + bdrv_reopen_set_read_only(base, true, NULL); + } return; } From 295cf237c2e5112de3e2aa7c3dbc80580b80bf88 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:41 +0200 Subject: [PATCH 34/42] block: Drop bdrv_reopen() No one is using this function anymore, so we can safely remove it. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 21 --------------------- include/block/block.h | 1 - 2 files changed, 22 deletions(-) diff --git a/block.c b/block.c index d7b94794f4..071465ee3a 100644 --- a/block.c +++ b/block.c @@ -3125,27 +3125,6 @@ cleanup: return ret; } - -/* Reopen a single BlockDriverState with the specified flags. */ -int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) -{ - int ret = -1; - Error *local_err = NULL; - BlockReopenQueue *queue; - - bdrv_subtree_drained_begin(bs); - - queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags); - ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - } - - bdrv_subtree_drained_end(bs); - - return ret; -} - int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) { diff --git a/include/block/block.h b/include/block/block.h index 382e6643fc..de72c7a093 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -302,7 +302,6 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, int flags); int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp); -int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp); int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); int bdrv_reopen_prepare(BDRVReopenState *reopen_state, From dc900c35239bc865df2dff5880eabcd25b974f19 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:42 +0200 Subject: [PATCH 35/42] qemu-io: Put flag changes in the options QDict in reopen_f() When reopen_f() puts a block device in the reopen queue, some of the new options are passed using a QDict, but others ("read-only" and the cache options) are passed as flags. This patch puts those flags in the QDict. This way the flags parameter becomes redundant and we'll be able to get rid of it in a subsequent patch. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- qemu-io-cmds.c | 27 ++++++++++++++++++++++++++- tests/qemu-iotests/133 | 9 +++++++++ tests/qemu-iotests/133.out | 8 ++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 5363482213..c9ae09d574 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -10,6 +10,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" #include "qemu-io.h" #include "sysemu/block-backend.h" #include "block/block.h" @@ -1978,6 +1979,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) int flags = bs->open_flags; bool writethrough = !blk_enable_write_cache(blk); bool has_rw_option = false; + bool has_cache_option = false; BlockReopenQueue *brq; Error *local_err = NULL; @@ -1989,6 +1991,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) error_report("Invalid cache option: %s", optarg); return -EINVAL; } + has_cache_option = true; break; case 'o': if (!qemu_opts_parse_noisily(&reopen_opts, optarg, 0)) { @@ -2046,9 +2049,31 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) } qopts = qemu_opts_find(&reopen_opts, NULL); - opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL; + opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : qdict_new(); qemu_opts_reset(&reopen_opts); + if (qdict_haskey(opts, BDRV_OPT_READ_ONLY)) { + if (has_rw_option) { + error_report("Cannot set both -r/-w and '" BDRV_OPT_READ_ONLY "'"); + qobject_unref(opts); + return -EINVAL; + } + } else { + qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR)); + } + + if (qdict_haskey(opts, BDRV_OPT_CACHE_DIRECT) || + qdict_haskey(opts, BDRV_OPT_CACHE_NO_FLUSH)) { + if (has_cache_option) { + error_report("Cannot set both -c and the cache options"); + qobject_unref(opts); + return -EINVAL; + } + } else { + qdict_put_bool(opts, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE); + qdict_put_bool(opts, BDRV_OPT_CACHE_NO_FLUSH, flags & BDRV_O_NO_FLUSH); + } + bdrv_subtree_drained_begin(bs); brq = bdrv_reopen_queue(NULL, bs, opts, flags); bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err); diff --git a/tests/qemu-iotests/133 b/tests/qemu-iotests/133 index a9a47a3c36..63b25f394b 100755 --- a/tests/qemu-iotests/133 +++ b/tests/qemu-iotests/133 @@ -91,6 +91,15 @@ echo IMGOPTSSYNTAX=false $QEMU_IO -f null-co -c 'reopen' -c 'info' \ "json:{'driver': 'null-co', 'size': 65536}" +echo +echo "=== Check that mixing -c/-r/-w and their corresponding options is forbidden ===" +echo + +$QEMU_IO -c 'reopen -r -o read-only=on' $TEST_IMG +$QEMU_IO -c 'reopen -w -o read-only=on' $TEST_IMG +$QEMU_IO -c 'reopen -c none -o cache.direct=on' $TEST_IMG +$QEMU_IO -c 'reopen -c writeback -o cache.direct=on' $TEST_IMG +$QEMU_IO -c 'reopen -c directsync -o cache.no-flush=on' $TEST_IMG # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/133.out b/tests/qemu-iotests/133.out index f4a85aeb63..48a9d087f0 100644 --- a/tests/qemu-iotests/133.out +++ b/tests/qemu-iotests/133.out @@ -24,4 +24,12 @@ Cannot change the option 'driver' format name: null-co format name: null-co + +=== Check that mixing -c/-r/-w and their corresponding options is forbidden === + +Cannot set both -r/-w and 'read-only' +Cannot set both -r/-w and 'read-only' +Cannot set both -c and the cache options +Cannot set both -c and the cache options +Cannot set both -c and the cache options *** done From 3c4e964762c7d0292185473657dd2afc0d2d060b Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:43 +0200 Subject: [PATCH 36/42] block: Clean up reopen_backing_file() in block/replication.c This function is used to put the hidden and secondary disks in read-write mode before launching the backup job, and back in read-only mode afterwards. This patch does the following changes: - Use an options QDict with the "read-only" option instead of passing the changes as flags only. - Simplify the code (it was unnecessarily complicated and verbose). - Fix a bug due to which the secondary disk was not being put back in read-only mode when writable=false (because in this case orig_secondary_flags always had the BDRV_O_RDWR flag set). - Stop clearing the BDRV_O_INACTIVE flag. The flags parameter to bdrv_reopen_queue() becomes redundant and we'll be able to get rid of it in a subsequent patch. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/replication.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/block/replication.c b/block/replication.c index 0c2989d2cb..af9dba8696 100644 --- a/block/replication.c +++ b/block/replication.c @@ -20,6 +20,7 @@ #include "block/block_backup.h" #include "sysemu/block-backend.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" #include "replication.h" typedef enum { @@ -39,8 +40,8 @@ typedef struct BDRVReplicationState { char *top_id; ReplicationState *rs; Error *blocker; - int orig_hidden_flags; - int orig_secondary_flags; + bool orig_hidden_read_only; + bool orig_secondary_read_only; int error; } BDRVReplicationState; @@ -349,44 +350,40 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) } } +/* This function is supposed to be called twice: + * first with writable = true, then with writable = false. + * The first call puts s->hidden_disk and s->secondary_disk in + * r/w mode, and the second puts them back in their original state. + */ static void reopen_backing_file(BlockDriverState *bs, bool writable, Error **errp) { BDRVReplicationState *s = bs->opaque; BlockReopenQueue *reopen_queue = NULL; - int orig_hidden_flags, orig_secondary_flags; - int new_hidden_flags, new_secondary_flags; Error *local_err = NULL; if (writable) { - orig_hidden_flags = s->orig_hidden_flags = - bdrv_get_flags(s->hidden_disk->bs); - new_hidden_flags = (orig_hidden_flags | BDRV_O_RDWR) & - ~BDRV_O_INACTIVE; - orig_secondary_flags = s->orig_secondary_flags = - bdrv_get_flags(s->secondary_disk->bs); - new_secondary_flags = (orig_secondary_flags | BDRV_O_RDWR) & - ~BDRV_O_INACTIVE; - } else { - orig_hidden_flags = (s->orig_hidden_flags | BDRV_O_RDWR) & - ~BDRV_O_INACTIVE; - new_hidden_flags = s->orig_hidden_flags; - orig_secondary_flags = (s->orig_secondary_flags | BDRV_O_RDWR) & - ~BDRV_O_INACTIVE; - new_secondary_flags = s->orig_secondary_flags; + s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs); + s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs); } bdrv_subtree_drained_begin(s->hidden_disk->bs); bdrv_subtree_drained_begin(s->secondary_disk->bs); - if (orig_hidden_flags != new_hidden_flags) { - reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL, - new_hidden_flags); + if (s->orig_hidden_read_only) { + int flags = bdrv_get_flags(s->hidden_disk->bs); + QDict *opts = qdict_new(); + qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); + reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, + opts, flags); } - if (!(orig_secondary_flags & BDRV_O_RDWR)) { + if (s->orig_secondary_read_only) { + int flags = bdrv_get_flags(s->secondary_disk->bs); + QDict *opts = qdict_new(); + qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs, - NULL, new_secondary_flags); + opts, flags); } if (reopen_queue) { From 2e891722c5e3d0372e042173a425925e14749bf0 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:44 +0200 Subject: [PATCH 37/42] block: Remove flags parameter from bdrv_reopen_queue() Now that all callers are passing all flag changes as QDict options, the flags parameter is no longer necessary, so we can get rid of it. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 5 +++-- block/replication.c | 6 ++---- include/block/block.h | 3 +-- qemu-io-cmds.c | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/block.c b/block.c index 071465ee3a..800c341244 100644 --- a/block.c +++ b/block.c @@ -3060,8 +3060,9 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, - QDict *options, int flags) + QDict *options) { + int flags = bdrv_get_flags(bs); return bdrv_reopen_queue_child(bs_queue, bs, options, flags, NULL, NULL, 0); } @@ -3135,7 +3136,7 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only); bdrv_subtree_drained_begin(bs); - queue = bdrv_reopen_queue(NULL, bs, opts, bdrv_get_flags(bs)); + queue = bdrv_reopen_queue(NULL, bs, opts); ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, errp); bdrv_subtree_drained_end(bs); diff --git a/block/replication.c b/block/replication.c index af9dba8696..e70dd95001 100644 --- a/block/replication.c +++ b/block/replication.c @@ -371,19 +371,17 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, bdrv_subtree_drained_begin(s->secondary_disk->bs); if (s->orig_hidden_read_only) { - int flags = bdrv_get_flags(s->hidden_disk->bs); QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, - opts, flags); + opts); } if (s->orig_secondary_read_only) { - int flags = bdrv_get_flags(s->secondary_disk->bs); QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs, - opts, flags); + opts); } if (reopen_queue) { diff --git a/include/block/block.h b/include/block/block.h index de72c7a093..f70a843b72 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -299,8 +299,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, int flags, Error **errp); BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, - BlockDriverState *bs, - QDict *options, int flags); + BlockDriverState *bs, QDict *options); int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp); int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index c9ae09d574..2c39124036 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -2075,7 +2075,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) } bdrv_subtree_drained_begin(bs); - brq = bdrv_reopen_queue(NULL, bs, opts, flags); + brq = bdrv_reopen_queue(NULL, bs, opts); bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err); bdrv_subtree_drained_end(bs); From 9aa09ddd1edf1256cb6e4e23293720fee130f07c Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:45 +0200 Subject: [PATCH 38/42] block: Stop passing flags to bdrv_reopen_queue_child() Now that all callers are passing the new options using the QDict we no longer need the 'flags' parameter. This patch makes the following changes: 1) The update_options_from_flags() call is no longer necessary so it can be removed. 2) The update_flags_from_options() call is now used in all cases, and is moved down a few lines so it happens after the options QDict contains the final set of values. 3) The flags parameter is removed. Now the flags are initialized using the current value (for the top-level node) or the parent flags (after inherit_options()). In both cases the initial values are updated to reflect the new options in the QDict. This happens in bdrv_reopen_queue_child() (as explained above) and in bdrv_reopen_prepare(). Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/block.c b/block.c index 800c341244..cb354438be 100644 --- a/block.c +++ b/block.c @@ -2931,7 +2931,6 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, - int flags, const BdrvChildRole *role, QDict *parent_options, int parent_flags) @@ -2940,7 +2939,9 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockReopenQueueEntry *bs_entry; BdrvChild *child; - QDict *old_options, *explicit_options; + QDict *old_options, *explicit_options, *options_copy; + int flags; + QemuOpts *opts; /* Make sure that the caller remembered to use a drained section. This is * important to avoid graph changes between the recursive queuing here and @@ -2966,22 +2967,11 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, /* * Precedence of options: * 1. Explicitly passed in options (highest) - * 2. Set in flags (only for top level) - * 3. Retained from explicitly set options of bs - * 4. Inherited from parent node - * 5. Retained from effective options of bs + * 2. Retained from explicitly set options of bs + * 3. Inherited from parent node + * 4. Retained from effective options of bs */ - if (!parent_options) { - /* - * Any setting represented by flags is always updated. If the - * corresponding QDict option is set, it takes precedence. Otherwise - * the flag is translated into a QDict option. The old setting of bs is - * not considered. - */ - update_options_from_flags(options, flags); - } - /* Old explicitly set values (don't overwrite by inherited value) */ if (bs_entry) { old_options = qdict_clone_shallow(bs_entry->state.explicit_options); @@ -2995,16 +2985,10 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, /* Inherit from parent node */ if (parent_options) { - QemuOpts *opts; - QDict *options_copy; - assert(!flags); + flags = 0; role->inherit_options(&flags, options, parent_flags, parent_options); - options_copy = qdict_clone_shallow(options); - opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options_copy, NULL); - update_flags_from_options(&flags, opts); - qemu_opts_del(opts); - qobject_unref(options_copy); + } else { + flags = bdrv_get_flags(bs); } /* Old values are used for options that aren't set yet */ @@ -3012,6 +2996,14 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, bdrv_join_options(bs, options, old_options); qobject_unref(old_options); + /* We have the final set of options so let's update the flags */ + options_copy = qdict_clone_shallow(options); + opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options_copy, NULL); + update_flags_from_options(&flags, opts); + qemu_opts_del(opts); + qobject_unref(options_copy); + /* bdrv_open_inherit() sets and clears some additional flags internally */ flags &= ~BDRV_O_PROTOCOL; if (flags & BDRV_O_RDWR) { @@ -3051,7 +3043,7 @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, qdict_extract_subqdict(options, &new_child_options, child_key_dot); g_free(child_key_dot); - bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options, 0, + bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options, child->role, options, flags); } @@ -3062,9 +3054,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options) { - int flags = bdrv_get_flags(bs); - return bdrv_reopen_queue_child(bs_queue, bs, options, flags, - NULL, NULL, 0); + return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, NULL, 0); } /* From 8eb4b07b6f9466f850be5cfb7d471cdd97086178 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:46 +0200 Subject: [PATCH 39/42] block: Remove assertions from update_flags_from_options() This function takes four options (cache.direct, cache.no-flush, read-only and auto-read-only) from a QemuOpts object and updates the flags accordingly. If any of those options is not set (because it was missing from the original QDict or because it had an invalid value) then the function aborts with a failed assertion: $ qemu-io -c 'reopen -o read-only=foo' hd.qcow2 block.c:1126: update_flags_from_options: Assertion `qemu_opt_find(opts, BDRV_OPT_CACHE_DIRECT)' failed. Aborted This assertion is unnecessary, and it forces any caller of bdrv_reopen() to pass all the aforementioned four options. This may have made sense in order to remove ambiguity when bdrv_reopen() was taking both flags and options, but that's not the case anymore. It's also unnecessary if we want to validate the option values, because bdrv_reopen_prepare() already takes care of that, as we can see if we remove the assertions: $ qemu-io -c 'reopen -o read-only=foo' hd.qcow2 Parameter 'read-only' expects 'on' or 'off' Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 4 ---- tests/qemu-iotests/133 | 9 +++++++++ tests/qemu-iotests/133.out | 7 +++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index cb354438be..c6d4564bcf 100644 --- a/block.c +++ b/block.c @@ -1139,22 +1139,18 @@ static void update_flags_from_options(int *flags, QemuOpts *opts) { *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY); - assert(qemu_opt_find(opts, BDRV_OPT_CACHE_NO_FLUSH)); if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) { *flags |= BDRV_O_NO_FLUSH; } - assert(qemu_opt_find(opts, BDRV_OPT_CACHE_DIRECT)); if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) { *flags |= BDRV_O_NOCACHE; } - assert(qemu_opt_find(opts, BDRV_OPT_READ_ONLY)); if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) { *flags |= BDRV_O_RDWR; } - assert(qemu_opt_find(opts, BDRV_OPT_AUTO_READ_ONLY)); if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) { *flags |= BDRV_O_AUTO_RDONLY; } diff --git a/tests/qemu-iotests/133 b/tests/qemu-iotests/133 index 63b25f394b..565e0b1b6e 100755 --- a/tests/qemu-iotests/133 +++ b/tests/qemu-iotests/133 @@ -100,6 +100,15 @@ $QEMU_IO -c 'reopen -w -o read-only=on' $TEST_IMG $QEMU_IO -c 'reopen -c none -o cache.direct=on' $TEST_IMG $QEMU_IO -c 'reopen -c writeback -o cache.direct=on' $TEST_IMG $QEMU_IO -c 'reopen -c directsync -o cache.no-flush=on' $TEST_IMG + +echo +echo "=== Check that invalid options are handled correctly ===" +echo + +$QEMU_IO -c 'reopen -o read-only=foo' $TEST_IMG +$QEMU_IO -c 'reopen -o cache.no-flush=bar' $TEST_IMG +$QEMU_IO -c 'reopen -o cache.direct=baz' $TEST_IMG +$QEMU_IO -c 'reopen -o auto-read-only=qux' $TEST_IMG # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/133.out b/tests/qemu-iotests/133.out index 48a9d087f0..414c7fa27f 100644 --- a/tests/qemu-iotests/133.out +++ b/tests/qemu-iotests/133.out @@ -32,4 +32,11 @@ Cannot set both -r/-w and 'read-only' Cannot set both -c and the cache options Cannot set both -c and the cache options Cannot set both -c and the cache options + +=== Check that invalid options are handled correctly === + +Parameter 'read-only' expects 'on' or 'off' +Parameter 'cache.no-flush' expects 'on' or 'off' +Parameter 'cache.direct' expects 'on' or 'off' +Parameter 'auto-read-only' expects 'on' or 'off' *** done From e6d79c41c961bcdacb60b46c88009b33841f90a7 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 12 Nov 2018 16:00:47 +0200 Subject: [PATCH 40/42] block: Assert that flags are up-to-date in bdrv_reopen_prepare() Towards the end of bdrv_reopen_queue_child(), before starting to process the children, the update_flags_from_options() function is called in order to have BDRVReopenState.flags in sync with the options from the QDict. This is necessary because during the reopen process flags must be updated for all nodes in the queue so bdrv_is_writable_after_reopen() and the permission checks work correctly. Because of that, calling update_flags_from_options() again in bdrv_reopen_prepare() doesn't really change the flags (they are already up-to-date). But we need to call it in order to remove those options from QemuOpts and that way indicate that they have been processed. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block.c b/block.c index c6d4564bcf..4f5ff2cc12 100644 --- a/block.c +++ b/block.c @@ -3197,6 +3197,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, Error **errp) { int ret = -1; + int old_flags; Error *local_err = NULL; BlockDriver *drv; QemuOpts *opts; @@ -3223,7 +3224,12 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, goto error; } + /* This was already called in bdrv_reopen_queue_child() so the flags + * are up-to-date. This time we simply want to remove the options from + * QemuOpts in order to indicate that they have been processed. */ + old_flags = reopen_state->flags; update_flags_from_options(&reopen_state->flags, opts); + assert(old_flags == reopen_state->flags); discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD); if (discard != NULL) { From 2c26e648e4350079b0c86a6627b2d3566c3709c0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 5 Dec 2018 09:43:08 +0100 Subject: [PATCH 41/42] iotests: make 235 work on s390 (and others) "-machine pc" will not work all architectures. Lets fall back to the default machine by not specifying it. In addition we also need to specify -no-shutdown on s390 as qemu will exit otherwise. Cc: qemu-stable@nongnu.org Signed-off-by: Christian Borntraeger Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/qemu-iotests/235 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/235 b/tests/qemu-iotests/235 index da044ed34e..d6edd97ab4 100755 --- a/tests/qemu-iotests/235 +++ b/tests/qemu-iotests/235 @@ -49,7 +49,9 @@ qemu_img_create('-f', iotests.imgfmt, '-o', 'preallocation=metadata', disk, str(size)) vm = QEMUMachine(iotests.qemu_prog) -vm.add_args('-machine', 'pc,accel=kvm') +vm.add_args('-machine', 'accel=kvm') +if iotests.qemu_default_machine == 's390-ccw-virtio': + vm.add_args('-no-shutdown') vm.add_args('-drive', 'id=src,file=' + disk) vm.launch() From 537c3d4f64297911a5b70a151926cd7851bbf752 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 13 Dec 2018 11:24:34 +0000 Subject: [PATCH 42/42] block/mirror: add missing coroutine_fn annotations Marking a function coroutine_fn currently has no effect on the compiler, but it documents that this function must be called from coroutine context and it may yield. This is important information for the programmer. Also, if we ever transition to a stackless coroutine implementation, then it's likely that the annotation will become mandatory so the compiler can use the correct calling convention for coroutine functions. Cc: Max Reitz Cc: John Snow Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/mirror.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 628f9e6a0b..ab59ad77e8 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -277,7 +277,8 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, return ret; } -static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) +static inline void coroutine_fn +mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) { MirrorOp *op; @@ -295,7 +296,8 @@ static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) abort(); } -static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) +static inline void coroutine_fn +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) { /* Only non-active operations use up in-flight slots */ mirror_wait_for_any_operation(s, false); @@ -598,7 +600,7 @@ static void mirror_free_init(MirrorBlockJob *s) * mirror_resume() because mirror_run() will begin iterating again * when the job is resumed. */ -static void mirror_wait_for_all_io(MirrorBlockJob *s) +static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { mirror_wait_for_free_in_flight_slot(s); @@ -732,7 +734,7 @@ static void mirror_abort(Job *job) assert(ret == 0); } -static void mirror_throttle(MirrorBlockJob *s) +static void coroutine_fn mirror_throttle(MirrorBlockJob *s) { int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); @@ -1107,7 +1109,7 @@ static void mirror_complete(Job *job, Error **errp) job_enter(job); } -static void mirror_pause(Job *job) +static void coroutine_fn mirror_pause(Job *job) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); @@ -1178,9 +1180,10 @@ static const BlockJobDriver commit_active_job_driver = { .drain = mirror_drain, }; -static void do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static void coroutine_fn +do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BdrvDirtyBitmapIter *iter; QEMUIOVector target_qiov;