From be2ebc6dad31918e9b6ba241dbe1ea0942c5d691 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 17 Nov 2014 11:18:32 +0100 Subject: [PATCH 1/6] raw-posix: Fix comment for raw_co_get_block_status() Missed in commit 705be72. Signed-off-by: Markus Armbruster Reviewed-by: Paolo Bonzini Reviewed-by: Fam Zheng Reviewed-by: Eric Blake Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/raw-posix.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index e100ae2046..706d3c02b1 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1555,9 +1555,7 @@ static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, } /* - * Returns true iff the specified sector is present in the disk image. Drivers - * not implementing the functionality are assumed to not support backing files, - * hence all their sectors are reported as allocated. + * Returns the allocation status of the specified sectors. * * If 'sector_num' is beyond the end of the disk image the return value is 0 * and 'pnum' is set to 0. From c4875e5b2216cf5427459e619b10f75083565792 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 17 Nov 2014 11:18:33 +0100 Subject: [PATCH 2/6] raw-posix: SEEK_HOLE suffices, get rid of FIEMAP Commit 5500316 (May 2012) implemented raw_co_is_allocated() as follows: 1. If defined(CONFIG_FIEMAP), use the FS_IOC_FIEMAP ioctl 2. Else if defined(SEEK_HOLE) && defined(SEEK_DATA), use lseek() 3. Else pretend there are no holes Later on, raw_co_is_allocated() was generalized to raw_co_get_block_status(). Commit 4f11aa8 (May 2014) changed it to try the three methods in order until success, because "there may be implementations which support [SEEK_HOLE/SEEK_DATA] but not [FIEMAP] (e.g., NFSv4.2) as well as vice versa." Unfortunately, we used FIEMAP incorrectly: we lacked FIEMAP_FLAG_SYNC. Commit 38c4d0a (Sep 2014) added it. Because that's a significant speed hit, the next commit 7c159037 put SEEK_HOLE/SEEK_DATA first. As you see, the obvious use of FIEMAP is wrong, and the correct use is slow. I guess this puts it somewhere between -7 "The obvious use is wrong" and -10 "It's impossible to get right" on Rusty Russel's Hard to Misuse scale[*]. "Fortunately", the FIEMAP code is used only when * SEEK_HOLE/SEEK_DATA aren't defined, but CONFIG_FIEMAP is Uncommon. SEEK_HOLE had no XFS implementation between 2011 (when it was introduced for ext4 and btrfs) and 2012. * SEEK_HOLE/SEEK_DATA and CONFIG_FIEMAP are defined, but lseek() fails Unlikely. Thus, the FIEMAP code executes rarely. Makes it a nice hidey-hole for bugs. Worse, bugs hiding there can theoretically bite even on a host that has SEEK_HOLE/SEEK_DATA. I don't want to worry about this crap, not even theoretically. Get rid of it. [*] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html Signed-off-by: Markus Armbruster Reviewed-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- block/raw-posix.c | 63 +++-------------------------------------------- 1 file changed, 4 insertions(+), 59 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 706d3c02b1..a29130ea42 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -60,9 +60,6 @@ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ #endif #endif -#ifdef CONFIG_FIEMAP -#include -#endif #ifdef CONFIG_FALLOCATE_PUNCH_HOLE #include #endif @@ -151,9 +148,6 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool needs_alignment; -#ifdef CONFIG_FIEMAP - bool skip_fiemap; -#endif } BDRVRawState; typedef struct BDRVRawReopenState { @@ -1481,52 +1475,6 @@ out: return result; } -static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole, int nb_sectors) -{ -#ifdef CONFIG_FIEMAP - BDRVRawState *s = bs->opaque; - int ret = 0; - struct { - struct fiemap fm; - struct fiemap_extent fe; - } f; - - if (s->skip_fiemap) { - return -ENOTSUP; - } - - f.fm.fm_start = start; - f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; - f.fm.fm_flags = FIEMAP_FLAG_SYNC; - f.fm.fm_extent_count = 1; - f.fm.fm_reserved = 0; - if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { - s->skip_fiemap = true; - return -errno; - } - - if (f.fm.fm_mapped_extents == 0) { - /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length. - * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! - */ - off_t length = lseek(s->fd, 0, SEEK_END); - *hole = f.fm.fm_start; - *data = MIN(f.fm.fm_start + f.fm.fm_length, length); - } else { - *data = f.fe.fe_logical; - *hole = f.fe.fe_logical + f.fe.fe_length; - if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { - ret |= BDRV_BLOCK_ZERO; - } - } - - return ret; -#else - return -ENOTSUP; -#endif -} - static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, off_t *hole) { @@ -1593,13 +1541,10 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, ret = try_seek_hole(bs, start, &data, &hole); if (ret < 0) { - ret = try_fiemap(bs, start, &data, &hole, nb_sectors); - if (ret < 0) { - /* Assume everything is allocated. */ - data = 0; - hole = start + nb_sectors * BDRV_SECTOR_SIZE; - ret = 0; - } + /* Assume everything is allocated. */ + data = 0; + hole = start + nb_sectors * BDRV_SECTOR_SIZE; + ret = 0; } assert(ret >= 0); From d1f06fe665acdd7aa7a46a5ef88172c3d7d3028e Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 17 Nov 2014 11:18:34 +0100 Subject: [PATCH 3/6] raw-posix: The SEEK_HOLE code is flawed, rewrite it On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris, but not Linux), try_seek_hole() reports trailing data instead. Additionally, unlikely lseek() failures are treated badly: * When SEEK_HOLE fails, try_seek_hole() reports trailing data. For -ENXIO, there's in fact a trailing hole. Can happen only when something truncated the file since we opened it. * When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds, then try_seek_hole() reports a trailing hole. This is okay only when SEEK_DATA failed with -ENXIO (which means the non-trailing hole found by SEEK_HOLE has since become trailing somehow). For other failures (unlikely), it's wrong. * When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely), then try_seek_hole() reports bogus data [-1,start), which its caller raw_co_get_block_status() turns into zero sectors of data. Could theoretically lead to infinite loops in code that attempts to scan data vs. hole forward. Rewrite from scratch, with very careful comments. Signed-off-by: Markus Armbruster Reviewed-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- block/raw-posix.c | 111 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 26 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index a29130ea42..414e6d1e91 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1475,28 +1475,86 @@ out: return result; } -static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole) +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + */ +static int find_allocation(BlockDriverState *bs, off_t start, + off_t *data, off_t *hole) { #if defined SEEK_HOLE && defined SEEK_DATA BDRVRawState *s = bs->opaque; + off_t offs; - *hole = lseek(s->fd, start, SEEK_HOLE); - if (*hole == -1) { - return -errno; + /* + * SEEK_DATA cases: + * D1. offs == start: start is in data + * D2. offs > start: start is in a hole, next data at offs + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole + * or start is beyond EOF + * If the latter happens, the file has been truncated behind + * our back since we opened it. All bets are off then. + * Treating like a trailing hole is simplest. + * D4. offs < 0, errno != ENXIO: we learned nothing + */ + offs = lseek(s->fd, start, SEEK_DATA); + if (offs < 0) { + return -errno; /* D3 or D4 */ + } + assert(offs >= start); + + if (offs > start) { + /* D2: in hole, next data at offs */ + *hole = start; + *data = offs; + return 0; } - if (*hole > start) { + /* D1: in data, end not yet known */ + + /* + * SEEK_HOLE cases: + * H1. offs == start: start is in a hole + * If this happens here, a hole has been dug behind our back + * since the previous lseek(). + * H2. offs > start: either start is in data, next hole at offs, + * or start is in trailing hole, EOF at offs + * Linux treats trailing holes like any other hole: offs == + * start. Solaris seeks to EOF instead: offs > start (blech). + * If that happens here, a hole has been dug behind our back + * since the previous lseek(). + * H3. offs < 0, errno = ENXIO: start is beyond EOF + * If this happens, the file has been truncated behind our + * back since we opened it. Treat it like a trailing hole. + * H4. offs < 0, errno != ENXIO: we learned nothing + * Pretend we know nothing at all, i.e. "forget" about D1. + */ + offs = lseek(s->fd, start, SEEK_HOLE); + if (offs < 0) { + return -errno; /* D1 and (H3 or H4) */ + } + assert(offs >= start); + + if (offs > start) { + /* + * D1 and H2: either in data, next hole at offs, or it was in + * data but is now in a trailing hole. In the latter case, + * all bets are off. Treating it as if it there was data all + * the way to EOF is safe, so simply do that. + */ *data = start; - } else { - /* On a hole. We need another syscall to find its end. */ - *data = lseek(s->fd, start, SEEK_DATA); - if (*data == -1) { - *data = lseek(s->fd, 0, SEEK_END); - } + *hole = offs; + return 0; } - return 0; + /* D1 and H1 */ + return -EBUSY; #else return -ENOTSUP; #endif @@ -1539,25 +1597,26 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); } - ret = try_seek_hole(bs, start, &data, &hole); - if (ret < 0) { - /* Assume everything is allocated. */ - data = 0; - hole = start + nb_sectors * BDRV_SECTOR_SIZE; - ret = 0; - } - - assert(ret >= 0); - - if (data <= start) { + ret = find_allocation(bs, start, &data, &hole); + if (ret == -ENXIO) { + /* Trailing hole */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_ZERO; + } else if (ret < 0) { + /* No info available, so pretend there are no holes */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA; + } else if (data == start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); - return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute sectors to the beginning of the next extent. */ + assert(hole == start); *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start; + ret = BDRV_BLOCK_ZERO; } + return ret | BDRV_BLOCK_OFFSET_VALID | start; } static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, From 39411cf3c316de0fe3cbb9585774bacfe3bd8efd Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Nov 2014 11:23:04 +0100 Subject: [PATCH 4/6] block/raw-posix: Fix preallocating write() loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit write() may write less bytes than requested; in this case, the number of bytes written is returned. This is the byte count we should be subtracting from the number of bytes still to be written, and not the byte count we requested to write. Reported-by: László Érsek Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 414e6d1e91..e0e48c5f51 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1451,7 +1451,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) "Could not write to the new file"); break; } - left -= num; + left -= result; } fsync(fd); g_free(buf); From 731de38052b245eab79e417aeac5e1dcebe6437f Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Nov 2014 11:23:05 +0100 Subject: [PATCH 5/6] block/raw-posix: Only sync after successful preallocation The loop which filled the file with zeroes may have been left early due to an error. In that case, the fsync() should be skipped. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index e0e48c5f51..d106fc456b 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1453,7 +1453,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) } left -= result; } - fsync(fd); + if (result >= 0) { + fsync(fd); + } g_free(buf); break; } From 098ffa6674a82ceac0e3ccb3a8a5bf6ca44adcd5 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Nov 2014 11:23:06 +0100 Subject: [PATCH 6/6] block/raw-posix: Catch fsync() errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fsync() may fail, and that case should be handled. Reported-by: László Érsek Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/raw-posix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index d106fc456b..b1af77e47f 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1454,7 +1454,12 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) left -= result; } if (result >= 0) { - fsync(fd); + result = fsync(fd); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not flush new file to disk"); + } } g_free(buf); break;