From 5e3c0220d7e4f0361c4d36c697a8842f2b583402 Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Thu, 1 Nov 2018 18:22:43 -0700 Subject: [PATCH 1/9] nvme: fix oob access issue (CVE-2018-16847) Currently, the nvme_cmb_ops mr doesn't check the addr and size. This can lead to an oob access issue. This is triggerable in the guest. Add check to avoid this issue. Fixes CVE-2018-16847. Reported-by: Li Qiang Reviewed-by: Paolo Bonzini Signed-off-by: Li Qiang Signed-off-by: Kevin Wolf --- hw/block/nvme.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 09d7c90259..d0226e7fdc 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1175,6 +1175,10 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; + + if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) { + return; + } memcpy(&n->cmbuf[addr], &data, size); } @@ -1183,6 +1187,9 @@ static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) { uint64_t val; NvmeCtrl *n = (NvmeCtrl *)opaque; + if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) { + return 0; + } memcpy(&val, &n->cmbuf[addr], size); return val; } From 441f6692ecc14859b77af2ac6d8f55e6f1354d3b Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Sun, 11 Nov 2018 09:40:23 +0000 Subject: [PATCH 2/9] fdc: fix segfault in fdctrl_stop_transfer() when DMA is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c8a35f1cf0f "fdc: use IsaDma interface instead of global DMA_* functions" accidentally introduced a segfault in fdctrl_stop_transfer() for non-DMA transfers. If fdctrl->dma_chann has not been configured then the fdctrl->dma interface reference isn't initialised during isabus_fdc_realize(). Unfortunately fdctrl_stop_transfer() unconditionally references the DMA interface when finishing the transfer causing a NULL pointer dereference. 
Fix the issue by adding a check in fdctrl_stop_transfer() so that the DMA interface reference and release method is only invoked if fdctrl->dma_chann has been set. (This issue was discovered by Martin testing a recent change in the NetBSD installer under qemu-system-sparc) Cc: qemu-stable@nongnu.org Reported-by: Martin Husemann Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Hervé Poussineau Reviewed-by: John Snow Signed-off-by: Kevin Wolf --- hw/block/fdc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/fdc.c b/hw/block/fdc.c index 2e9c1e1e2f..6f19f127a5 100644 --- a/hw/block/fdc.c +++ b/hw/block/fdc.c @@ -1617,7 +1617,7 @@ static void fdctrl_stop_transfer(FDCtrl *fdctrl, uint8_t status0, fdctrl->fifo[5] = cur_drv->sect; fdctrl->fifo[6] = FD_SECTOR_SC; fdctrl->data_dir = FD_DIR_READ; - if (!(fdctrl->msr & FD_MSR_NONDMA)) { + if (fdctrl->dma_chann != -1 && !(fdctrl->msr & FD_MSR_NONDMA)) { IsaDmaClass *k = ISADMA_GET_CLASS(fdctrl->dma); k->release_DREQ(fdctrl->dma, fdctrl->dma_chann); } From 443ba6befa2f47243def9b190c930a8a4f59f888 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 14 Nov 2018 13:50:16 +0100 Subject: [PATCH 3/9] vvfat: Fix memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't leak 'cluster' in the mapping == NULL case. Found by Coverity (CID 1055918). 
Fixes: 8d9401c2791ee2d2805b741b1ee3006041edcd3e Signed-off-by: Kevin Wolf Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Liam Merwick Tested-by: Philippe Mathieu-Daudé --- block/vvfat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index 1de5de1db4..b7b61ea8b7 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2510,7 +2510,7 @@ static int commit_one_file(BDRVVVFATState* s, uint32_t first_cluster = c; mapping_t* mapping = find_mapping_for_cluster(s, c); uint32_t size = filesize_of_direntry(direntry); - char* cluster = g_malloc(s->cluster_size); + char *cluster; uint32_t i; int fd = 0; @@ -2528,17 +2528,17 @@ static int commit_one_file(BDRVVVFATState* s, if (fd < 0) { fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, strerror(errno), errno); - g_free(cluster); return fd; } if (offset > 0) { if (lseek(fd, offset, SEEK_SET) != offset) { qemu_close(fd); - g_free(cluster); return -3; } } + cluster = g_malloc(s->cluster_size); + while (offset < size) { uint32_t c1; int rest_size = (size - offset > s->cluster_size ? From d3e1a7eb4ceb9489d575c45c9518137dfbd1389d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 15 Nov 2018 12:34:08 -0600 Subject: [PATCH 4/9] qcow2: Document some maximum size constraints Although off_t permits up to 63 bits (8EB) of file offsets, in practice, we're going to hit other limits first. Document some of those limits in the qcow2 spec (some are inherent, others are implementation choices of qemu), and how choice of cluster size can influence some of the limits. While we cannot map any uncompressed virtual cluster to any address higher than 64 PB (56 bits) (due to the current L1/L2 field encoding stopping at bit 55), qemu's cap of 8M for the refcount table can still access larger host addresses for some combinations of large clusters and small refcount_order. For comparison, ext4 with 4k blocks caps files at 16PB. 
Another interesting limit: for compressed clusters, the L2 layout requires an ever-smaller maximum host offset as cluster size gets larger, down to a 512 TB maximum with 2M clusters. In particular, note that with a cluster size of 8k or smaller, the L2 entry for a compressed cluster could technically point beyond the 64PB mark, but when you consider that with 8k clusters and refcount_order = 0, you cannot access beyond 512T without exceeding qemu's limit of an 8M cap on the refcount table, it is unlikely that any image in the wild has attempted to do so. To be safe, let's document that bits beyond 55 in a compressed cluster must be 0. Signed-off-by: Eric Blake Signed-off-by: Kevin Wolf --- docs/interop/qcow2.txt | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index 845d40a086..fb5cb47245 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -40,7 +40,18 @@ The first cluster of a qcow2 image contains the file header: with larger cluster sizes. 24 - 31: size - Virtual disk size in bytes + Virtual disk size in bytes. + + Note: qemu has an implementation limit of 32 MB as + the maximum L1 table size. With a 2 MB cluster + size, it is unable to populate a virtual cluster + beyond 2 EB (61 bits); with a 512 byte cluster + size, it is unable to populate a virtual size + larger than 128 GB (37 bits). Meanwhile, L1/L2 + table layouts limit an image to no more than 64 PB + (56 bits) of populated clusters, and an image may + hit other limits first (such as a file system's + maximum size). 32 - 35: crypt_method 0 for no encryption @@ -326,6 +337,17 @@ in the image file. It contains pointers to the second level structures which are called refcount blocks and are exactly one cluster in size. 
+Although a large enough refcount table can reserve clusters past 64 PB +(56 bits) (assuming the underlying protocol can even be sized that +large), note that some qcow2 metadata such as L1/L2 tables must point +to clusters prior to that point. + +Note: qemu has an implementation limit of 8 MB as the maximum refcount +table size. With a 2 MB cluster size and a default refcount_order of +4, it is unable to reference host resources beyond 2 EB (61 bits); in +the worst case, with a 512 cluster size and refcount_order of 6, it is +unable to access beyond 32 GB (35 bits). + Given an offset into the image file, the refcount of its cluster can be obtained as follows: @@ -365,6 +387,16 @@ The L1 table has a variable size (stored in the header) and may use multiple clusters, however it must be contiguous in the image file. L2 tables are exactly one cluster in size. +The L1 and L2 tables have implications on the maximum virtual file +size; for a given L1 table size, a larger cluster size is required for +the guest to have access to more space. Furthermore, a virtual +cluster must currently map to a host offset below 64 PB (56 bits) +(although this limit could be relaxed by putting reserved bits into +use). Additionally, as cluster size increases, the maximum host +offset for a compressed cluster is reduced (a 2M cluster size requires +compressed clusters to reside below 512 TB (49 bits), and this limit +cannot be relaxed without an incompatible layout change). + Given an offset into the virtual disk, the offset into the image file can be obtained as follows: @@ -427,7 +459,9 @@ Standard Cluster Descriptor: Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a - cluster or sector boundary! + cluster or sector boundary! If cluster_bits is + small enough that this field includes bits beyond + 55, those upper bits must be set to 0. 
x - 61: Number of additional 512-byte sectors used for the compressed data, beyond the sector containing the offset From 77d6a21558577fbdd35e65e0e1d03ae07214329f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Nov 2018 17:03:18 -0600 Subject: [PATCH 5/9] qcow2: Don't allow overflow during cluster allocation Our code was already checking that we did not attempt to allocate more clusters than what would fit in an INT64 (the physical maximum if we can access a full off_t's worth of data). But this does not catch smaller limits enforced by various spots in the qcow2 image description: L1 and normal clusters of L2 are documented as having bits 63-56 reserved for other purposes, capping our maximum offset at 64PB (bit 55 is the maximum bit set). And for compressed images with 2M clusters, the cap drops the maximum offset to bit 48, or a maximum offset of 512TB. If we overflow that offset, we would write compressed data into one place, but try to decompress from another, which won't work. It's actually possible to prove that overflow can cause image corruption without this patch; I'll add the iotests separately in the next commit. 
Signed-off-by: Eric Blake Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 20 +++++++++++++------- block/qcow2.h | 6 ++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 46082aeac1..1c63ac244a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -31,7 +31,8 @@ #include "qemu/bswap.h" #include "qemu/cutils.h" -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size, + uint64_t max); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, uint64_t addend, bool decrease, enum qcow2_discard_type type); @@ -362,7 +363,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } /* Allocate the refcount block itself and mark it as used */ - int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX); if (new_block < 0) { return new_block; } @@ -954,7 +955,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size, + uint64_t max) { BDRVQcow2State *s = bs->opaque; uint64_t i, nb_clusters, refcount; @@ -979,9 +981,9 @@ retry: } /* Make sure that all offsets in the "allocated" range are representable - * in an int64_t */ + * in the requested max */ if (s->free_cluster_index > 0 && - s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) + s->free_cluster_index - 1 > (max >> s->cluster_bits)) { return -EFBIG; } @@ -1001,7 +1003,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); do { - offset = alloc_clusters_noref(bs, size); + offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET); if (offset 
< 0) { return offset; } @@ -1083,7 +1085,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); do { if (!offset || free_in_cluster < size) { - int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + int64_t new_cluster; + + new_cluster = alloc_clusters_noref(bs, s->cluster_size, + MIN(s->cluster_offset_mask, + QCOW_MAX_CLUSTER_OFFSET)); if (new_cluster < 0) { return new_cluster; } diff --git a/block/qcow2.h b/block/qcow2.h index 29c98d87a0..8662b68575 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -42,6 +42,12 @@ #define QCOW_MAX_CRYPT_CLUSTERS 32 #define QCOW_MAX_SNAPSHOTS 65536 +/* Field widths in qcow2 mean normal cluster offsets cannot reach + * 64PB; depending on cluster size, compressed clusters can have a + * smaller limit (64PB for up to 16k clusters, then ramps down to + * 512TB for 2M clusters). */ +#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1) + /* 8 MB refcount table is enough for 2 PB images at 64k cluster size * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ #define QCOW_MAX_REFTABLE_SIZE S_8MiB From 3b94c343f98b660e1c1cf79a8f704151962ecd48 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 13 Nov 2018 17:03:19 -0600 Subject: [PATCH 6/9] iotests: Add new test 220 for max compressed cluster offset If you have a capable file system (tmpfs is good, ext4 not so much; run ./check with TEST_DIR pointing to a good location so as not to skip the test), it's actually possible to create a qcow2 file that expands to a sparse 512T image with just over 38M of content. The test is not the world's fastest (qemu crawling through 256M bits of refcount table to find the next cluster to allocate takes several seconds, as does qemu-img check reporting millions of leaked clusters); but it DOES catch the problem that the previous patch just fixed where writing a compressed cluster to a full image ended up overwriting the wrong cluster. 
Suggested-by: Max Reitz Signed-off-by: Eric Blake Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- tests/qemu-iotests/220 | 96 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/220.out | 54 +++++++++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 151 insertions(+) create mode 100755 tests/qemu-iotests/220 create mode 100644 tests/qemu-iotests/220.out diff --git a/tests/qemu-iotests/220 b/tests/qemu-iotests/220 new file mode 100755 index 0000000000..0c5682bda0 --- /dev/null +++ b/tests/qemu-iotests/220 @@ -0,0 +1,96 @@ +#!/bin/bash +# +# max limits on compression in huge qcow2 files +# +# Copyright (C) 2018 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq=$(basename $0) +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.pattern + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +echo "== Creating huge file ==" + +# Sanity check: We require a file system that permits the creation +# of a HUGE (but very sparse) file. tmpfs works, ext4 does not. +if ! 
truncate --size=513T "$TEST_IMG"; then + _notrun "file system on $TEST_DIR does not support large enough files" +fi +rm "$TEST_IMG" +IMGOPTS='cluster_size=2M,refcount_bits=1' _make_test_img 513T + +echo "== Populating refcounts ==" +# We want an image with 256M refcounts * 2M clusters = 512T referenced. +# Each 2M cluster holds 16M refcounts; the refcount table initially uses +# 1 refblock, so we need to add 15 more. The refcount table lives at 2M, +# first refblock at 4M, L2 at 6M, so our remaining additions start at 8M. +# Then, for each refblock, mark it as fully populated. +to_hex() { + printf %016x\\n $1 | sed 's/\(..\)/\\x\1/g' +} +truncate --size=38m "$TEST_IMG" +entry=$((0x200000)) +$QEMU_IO_PROG -f raw -c "w -P 0xff 4m 2m" "$TEST_IMG" | _filter_qemu_io +for i in {1..15}; do + offs=$((0x600000 + i*0x200000)) + poke_file "$TEST_IMG" $((i*8 + entry)) $(to_hex $offs) + $QEMU_IO_PROG -f raw -c "w -P 0xff $offs 2m" "$TEST_IMG" | _filter_qemu_io +done + +echo "== Checking file before ==" +# FIXME: 'qemu-img check' doesn't diagnose refcounts beyond the end of +# the file as leaked clusters +_check_test_img 2>&1 | sed '/^Leaked cluster/d' +stat -c 'image size %s' "$TEST_IMG" + +echo "== Trying to write compressed cluster ==" +# Given our file size, the next available cluster at 512T lies beyond the +# maximum offset that a compressed 2M cluster can reside in +$QEMU_IO_PROG -c 'w -c 0 2m' "$TEST_IMG" | _filter_qemu_io +# The attempt failed, but ended up allocating a new refblock +stat -c 'image size %s' "$TEST_IMG" + +echo "== Writing normal cluster ==" +# The failed write should not corrupt the image, so a normal write succeeds +$QEMU_IO_PROG -c 'w 0 2m' "$TEST_IMG" | _filter_qemu_io + +echo "== Checking file after ==" +# qemu-img now sees the millions of leaked clusters, thanks to the allocations +# at 512T. Undo many of our faked references to speed up the check. 
+$QEMU_IO_PROG -f raw -c "w -z 5m 1m" -c "w -z 8m 30m" "$TEST_IMG" | + _filter_qemu_io +_check_test_img 2>&1 | sed '/^Leaked cluster/d' + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/220.out b/tests/qemu-iotests/220.out new file mode 100644 index 0000000000..af3021fd88 --- /dev/null +++ b/tests/qemu-iotests/220.out @@ -0,0 +1,54 @@ +QA output created by 220 +== Creating huge file == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=564049465049088 +== Populating refcounts == +wrote 2097152/2097152 bytes at offset 4194304 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 8388608 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 10485760 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 12582912 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 14680064 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 16777216 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 18874368 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 20971520 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 23068672 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 25165824 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 27262976 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 29360128 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 31457280 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 33554432 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 
2097152/2097152 bytes at offset 35651584 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 37748736 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +== Checking file before == +No errors were found on the image. +image size 39845888 +== Trying to write compressed cluster == +write failed: Input/output error +image size 562949957615616 +== Writing normal cluster == +wrote 2097152/2097152 bytes at offset 0 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +== Checking file after == +wrote 1048576/1048576 bytes at offset 5242880 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 31457280/31457280 bytes at offset 8388608 +30 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +8388589 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index ebe4fe78bc..4d194716f2 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -219,6 +219,7 @@ 217 rw auto quick 218 rw auto quick 219 rw auto +220 rw auto 221 rw auto quick 222 rw auto quick 223 rw auto quick From 9ad08c44566bf4466c6263c71d43e9f7a354d4ba Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 16 Nov 2018 17:45:24 +0100 Subject: [PATCH 7/9] block: Always abort reopen after prepare succeeded bdrv_reopen_multiple() does not invoke bdrv_reopen_abort() for the element of the reopen queue for which bdrv_reopen_prepare() failed, because it assumes that the prepare function will have rolled back all changes already. However, bdrv_reopen_prepare() does not do this in every case: It may notice an error after BlockDriver.bdrv_reopen_prepare() succeeded, and it will not invoke BlockDriver.bdrv_reopen_abort() then; and neither will bdrv_reopen_multiple(), as explained above. This is wrong because we must always call .bdrv_reopen_commit() or .bdrv_reopen_abort() after .bdrv_reopen_prepare() has succeeded. 
Otherwise, the block driver has no chance to undo what it has done in its implementation of .bdrv_reopen_prepare(). To fix this, bdrv_reopen_prepare() has to call .bdrv_reopen_abort() if it wants to return an error after .bdrv_reopen_prepare() has succeeded. Signed-off-by: Max Reitz Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block.c b/block.c index fd67e14dfa..3feac08535 100644 --- a/block.c +++ b/block.c @@ -3201,6 +3201,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, QDict *orig_reopen_opts; char *discard = NULL; bool read_only; + bool drv_prepared = false; assert(reopen_state != NULL); assert(reopen_state->bs->drv != NULL); @@ -3285,6 +3286,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, goto error; } + drv_prepared = true; + /* Options that are not handled are only okay if they are unchanged * compared to the old state. It is expected that some options are only * used for the initial open, but not reopen (e.g. 
filename) */ @@ -3350,6 +3353,15 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, reopen_state->options = qobject_ref(orig_reopen_opts); error: + if (ret < 0 && drv_prepared) { + /* drv->bdrv_reopen_prepare() has succeeded, so we need to + * call drv->bdrv_reopen_abort() before signaling an error + * (bdrv_reopen_multiple() will not call bdrv_reopen_abort() + * when the respective bdrv_reopen_prepare() has failed) */ + if (drv->bdrv_reopen_abort) { + drv->bdrv_reopen_abort(reopen_state); + } + } qemu_opts_del(opts); qobject_unref(orig_reopen_opts); g_free(discard); From 577a133988c76e4ebf01d050d0d758d207a1baf7 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 16 Nov 2018 17:45:25 +0100 Subject: [PATCH 8/9] file-posix: Fix shared locks on reopen commit s->locked_shared_perm is the set of bits locked in the file, which is the inverse of the permissions actually shared. So we need to pass them as they are to raw_apply_lock_bytes() instead of inverting them again. Reported-by: Alberto Garcia Signed-off-by: Max Reitz Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/file-posix.c b/block/file-posix.c index 58c86a01ea..07bbdab953 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -959,7 +959,7 @@ static void raw_reopen_commit(BDRVReopenState *state) /* Copy locks to the new fd before closing the old one. */ raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm, - ~s->locked_shared_perm, false, &local_err); + s->locked_shared_perm, false, &local_err); if (local_err) { /* shouldn't fail in a sane host, but report it just in case. 
*/ error_report_err(local_err); From 6d0a4a0fb5c8f10c8eb68b52cfda0082b00ae963 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 16 Nov 2018 17:45:26 +0100 Subject: [PATCH 9/9] iotests: Test file-posix locking and reopen Signed-off-by: Max Reitz Reviewed-by: Alberto Garcia Signed-off-by: Kevin Wolf --- tests/qemu-iotests/182 | 71 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/182.out | 9 +++++ 2 files changed, 80 insertions(+) diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182 index 4b31592fb8..3b7689c1d5 100755 --- a/tests/qemu-iotests/182 +++ b/tests/qemu-iotests/182 @@ -31,6 +31,7 @@ status=1 # failure is the default! _cleanup() { _cleanup_test_img + rm -f "$TEST_IMG.overlay" } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -71,6 +72,76 @@ echo 'quit' | $QEMU -nographic -monitor stdio \ _cleanup_qemu +echo +echo '=== Testing reopen ===' +echo + +# This tests that reopening does not unshare any permissions it should +# not unshare +# (There was a bug where reopening shared exactly the opposite of the +# permissions it was supposed to share) + +_launch_qemu + +_send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'qmp_capabilities'}" \ + 'return' + +# Open the image without any format layer (we are not going to access +# it, so that is fine) +# This should keep all permissions shared. +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-add', + 'arguments': { + 'node-name': 'node0', + 'driver': 'file', + 'filename': '$TEST_IMG', + 'locking': 'on' + } }" \ + 'return' \ + 'error' + +# This snapshot will perform a reopen to drop R/W to RO. +# It should still keep all permissions shared. 
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-snapshot-sync', + 'arguments': { + 'node-name': 'node0', + 'snapshot-file': '$TEST_IMG.overlay', + 'snapshot-node-name': 'node1' + } }" \ + 'return' \ + 'error' + +# Now open the same file again +# This does not require any permissions (and does not unshare any), so +# this will not conflict with node0. +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-add', + 'arguments': { + 'node-name': 'node1', + 'driver': 'file', + 'filename': '$TEST_IMG', + 'locking': 'on' + } }" \ + 'return' \ + 'error' + +# Now we attach the image to a virtio-blk device. This device does +# require some permissions (at least WRITE and READ_CONSISTENT), so if +# reopening node0 unshared any (which it should not have), this will +# fail (but it should not). +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'device_add', + 'arguments': { + 'driver': 'virtio-blk', + 'drive': 'node1' + } }" \ + 'return' \ + 'error' + +_cleanup_qemu + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/182.out b/tests/qemu-iotests/182.out index f1463c8862..af501ca3f3 100644 --- a/tests/qemu-iotests/182.out +++ b/tests/qemu-iotests/182.out @@ -5,4 +5,13 @@ Starting QEMU Starting a second QEMU using the same image should fail QEMU_PROG: -drive file=TEST_DIR/t.qcow2,if=none,id=drive0,file.locking=on: Failed to get "write" lock Is another process using the image [TEST_DIR/t.qcow2]? + +=== Testing reopen === + +{"return": {}} +{"return": {}} +Formatting 'TEST_DIR/t.qcow2.overlay', fmt=qcow2 size=197120 backing_file=TEST_DIR/t.qcow2 backing_fmt=file cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} +{"return": {}} +{"return": {}} *** done