From 1a6d39fd71ddf90c5b76026cac4d5ff51fbaf8d8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:24 +0000 Subject: [PATCH 01/15] cutils: extract buffer_is_zero() from qemu-img.c The qemu-img.c:is_not_zero() function checks if a buffer contains all zeroes. This function will come in handy for zero-detection in the block layer, so clean it up and move it to cutils.c. Note that the function now returns true if the buffer is all zeroes. This avoids the double-negatives (i.e. !is_not_zero()) that the old function can cause in callers. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- cutils.c | 35 +++++++++++++++++++++++++++++++++++ qemu-common.h | 2 ++ qemu-img.c | 46 +++++++--------------------------------------- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cutils.c b/cutils.c index a6ffd46445..af308cd7b9 100644 --- a/cutils.c +++ b/cutils.c @@ -303,6 +303,41 @@ void qemu_iovec_memset_skip(QEMUIOVector *qiov, int c, size_t count, } } +/* + * Checks if a buffer is all zeroes + * + * Attention! The len must be a multiple of 4 * sizeof(long) due to + * restriction of optimizations in this function. + */ +bool buffer_is_zero(const void *buf, size_t len) +{ + /* + * Use long as the biggest available internal data type that fits into the + * CPU register and unroll the loop to smooth out the effect of memory + * latency. + */ + + size_t i; + long d0, d1, d2, d3; + const long * const data = buf; + + assert(len % (4 * sizeof(long)) == 0); + len /= sizeof(long); + + for (i = 0; i < len; i += 4) { + d0 = data[i + 0]; + d1 = data[i + 1]; + d2 = data[i + 2]; + d3 = data[i + 3]; + + if (d0 || d1 || d2 || d3) { + return false; + } + } + + return true; +} + #ifndef _WIN32 /* Sets a specific flag */ int fcntl_setfl(int fd, int flag) diff --git a/qemu-common.h b/qemu-common.h index 9b997f8838..c5e9cad35e 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -335,6 +335,8 @@ void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count); void qemu_iovec_memset_skip(QEMUIOVector *qiov, int c, size_t count, size_t skip); +bool buffer_is_zero(const void *buf, size_t len); + void qemu_progress_init(int enabled, float min_skip); void qemu_progress_end(void); void qemu_progress_print(float delta, int max); diff --git a/qemu-img.c b/qemu-img.c index 01cc0d35ad..c4bcf41e15 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -514,40 +514,6 @@ static int img_commit(int argc, char **argv) return 0; } -/* - * Checks whether the sector is not a zero sector. - * - * Attention! The len must be a multiple of 4 * sizeof(long) due to - * restriction of optimizations in this function. - */ -static int is_not_zero(const uint8_t *sector, int len) -{ - /* - * Use long as the biggest available internal data type that fits into the - * CPU register and unroll the loop to smooth out the effect of memory - * latency. - */ - - int i; - long d0, d1, d2, d3; - const long * const data = (const long *) sector; - - len /= sizeof(long); - - for(i = 0; i < len; i += 4) { - d0 = data[i + 0]; - d1 = data[i + 1]; - d2 = data[i + 2]; - d3 = data[i + 3]; - - if (d0 || d1 || d2 || d3) { - return 1; - } - } - - return 0; -} - /* * Returns true iff the first sector pointed to by 'buf' contains at least * a non-NUL byte. @@ -557,20 +523,22 @@ static int is_not_zero(const uint8_t *sector, int len) */ static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum) { - int v, i; + bool is_zero; + int i; if (n <= 0) { *pnum = 0; return 0; } - v = is_not_zero(buf, 512); + is_zero = buffer_is_zero(buf, 512); for(i = 1; i < n; i++) { buf += 512; - if (v != is_not_zero(buf, 512)) + if (is_zero != buffer_is_zero(buf, 512)) { break; + } } *pnum = i; - return v; + return !is_zero; } /* @@ -955,7 +923,7 @@ static int img_convert(int argc, char **argv) if (n < cluster_sectors) { memset(buf + n * 512, 0, cluster_size - n * 512); } - if (is_not_zero(buf, cluster_size)) { + if (!buffer_is_zero(buf, cluster_size)) { ret = bdrv_write_compressed(out_bs, sector_num, buf, cluster_sectors); if (ret != 0) { From f08f2ddae078e8a7683f8b16da8e0cc3029c7b89 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:25 +0000 Subject: [PATCH 02/15] block: add .bdrv_co_write_zeroes() interface The ability to zero regions of an image file is a useful primitive for higher-level features such as image streaming or zero write detection. Image formats may support an optimized metadata representation instead of writing zeroes into the image file. This allows zero writes to be potentially faster than regular write operations and also preserve sparseness of the image file. The .bdrv_co_write_zeroes() interface should be implemented by block drivers that wish to provide efficient zeroing. Note that this operation is different from the discard operation, which may leave the contents of the region indeterminate. That means discarded blocks are not guaranteed to contain zeroes and may contain junk data instead. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------ block.h | 8 ++++++++ block_int.h | 8 ++++++++ trace-events | 1 + 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index 3621d11de0..c9fa5c17ff 100644 --- a/block.c +++ b/block.c @@ -50,6 +50,7 @@ typedef enum { BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, } BdrvRequestFlags; static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); @@ -69,7 +70,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags); static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -1300,7 +1302,7 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) rwco->nb_sectors, rwco->qiov, 0); } else { rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov); + rwco->nb_sectors, rwco->qiov, 0); } } @@ -1639,11 +1641,37 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, BDRV_REQ_COPY_ON_READ); } +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov; + int ret; + + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + } + + /* Fall back to bounce buffer if write zeroes is unsupported */ + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + memset(iov.iov_base, 0, iov.iov_len); + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + + qemu_vfree(iov.iov_base); + return ret; +} + /* * Handle a write request in coroutine context */ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; BdrvTrackedRequest req; @@ -1670,7 +1698,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, tracked_request_begin(&req, bs, sector_num, nb_sectors, true); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + if (flags & BDRV_REQ_ZERO_WRITE) { + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); + } else { + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } if (bs->dirty_bitmap) { set_dirty_bitmap(bs, sector_num, nb_sectors, 1); @@ -1690,7 +1722,16 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_co_writev(bs, sector_num, nb_sectors); - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov); + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE); } /** @@ -3192,7 +3233,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) acb->req.nb_sectors, acb->req.qiov, 0); } else { acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov); + acb->req.nb_sectors, acb->req.qiov, 0); } acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); diff --git a/block.h b/block.h index cae289b2fb..60ea7308c6 100644 --- a/block.h +++ b/block.h @@ -146,6 +146,14 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +/* + * Efficiently zero a region of the disk image. Note that this is a regular + * I/O request like read or write and should have a reasonable size. This + * function is not suitable for zeroing the entire image in a single request + * because it may allocate memory for the entire region. + */ +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors); int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum); BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, diff --git a/block_int.h b/block_int.h index 7be2988ca7..7946cf66a1 100644 --- a/block_int.h +++ b/block_int.h @@ -131,6 +131,14 @@ struct BlockDriver { int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + /* + * Efficiently zero a region of the disk image. Typically an image format + * would use a compact metadata representation to implement this. This + * function pointer may be NULL and .bdrv_co_writev() will be called + * instead. + */ + int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs, diff --git a/trace-events b/trace-events index 5d05749643..3372087ed9 100644 --- a/trace-events +++ b/trace-events @@ -67,6 +67,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" +bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" From 79c053bde9fa40595670ae274d047da07f1df88c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:26 +0000 Subject: [PATCH 03/15] block: perform zero-detection during copy-on-read Copy-on-Read populates the image file with data read from a backing image. In order to avoid bloating the image file when all zeroes are read we should scan the buffer and perform an optimized zero write operation. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index c9fa5c17ff..ae297bb847 100644 --- a/block.c +++ b/block.c @@ -1517,6 +1517,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, */ void *bounce_buffer; + BlockDriver *drv = bs->drv; struct iovec iov; QEMUIOVector bounce_qiov; int64_t cluster_sector_num; @@ -1537,14 +1538,21 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); if (ret < 0) { goto err; } - ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors); + } else { + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, &bounce_qiov); + } + if (ret < 0) { /* It might be okay to ignore write errors for guest requests. If this * is a deliberate copy-on-read then we don't want to ignore the error. From 6e4f59bd0d69f0a7aa4010b49a5c49a01987b9d8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:27 +0000 Subject: [PATCH 04/15] qed: replace is_write with flags field Per-request attributes like read/write are currently implemented as bool fields in the QEDAIOCB struct. This becomes unwiedly as the number of attributes grows. For example, the qed_aio_setup() function would have to take multiple bool arguments and at call sites it would be hard to distinguish the meaning of each bool. Instead use a flags field with bitmask constants. This will be used when zero write support is added. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qed.c | 15 ++++++++------- block/qed.h | 6 +++++- trace-events | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/block/qed.c b/block/qed.c index 8da3ebe9d4..b66dd17a09 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1233,8 +1233,8 @@ static void qed_aio_next_io(void *opaque, int ret) { QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); - QEDFindClusterFunc *io_fn = - acb->is_write ? qed_aio_write_data : qed_aio_read_data; + QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? + qed_aio_write_data : qed_aio_read_data; trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); @@ -1264,14 +1264,14 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, bool is_write) + void *opaque, int flags) { QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque); trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, - opaque, is_write); + opaque, flags); - acb->is_write = is_write; + acb->flags = flags; acb->finished = NULL; acb->qiov = qiov; acb->qiov_offset = 0; @@ -1291,7 +1291,7 @@ static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false); + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); } static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, @@ -1300,7 +1300,8 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true); + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, + opaque, QED_AIOCB_WRITE); } static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs, diff --git a/block/qed.h b/block/qed.h index 62cbd3b899..abed1472ed 100644 --- a/block/qed.h +++ b/block/qed.h @@ -123,12 +123,16 @@ typedef struct QEDRequest { CachedL2Table *l2_table; } QEDRequest; +enum { + QED_AIOCB_WRITE = 0x0001, /* read or write? */ +}; + typedef struct QEDAIOCB { BlockDriverAIOCB common; QEMUBH *bh; int bh_ret; /* final return status for completion bh */ QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ - bool is_write; /* false - read, true - write */ + int flags; /* QED_AIOCB_* bits ORed together */ bool *finished; /* signal for cancel completion */ uint64_t end_pos; /* request end on block device, in bytes */ diff --git a/trace-events b/trace-events index 3372087ed9..9b26ce2304 100644 --- a/trace-events +++ b/trace-events @@ -321,7 +321,7 @@ qed_need_check_timer_cb(void *s) "s %p" qed_start_need_check_timer(void *s) "s %p" qed_cancel_need_check_timer(void *s) "s %p" qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d" -qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int is_write) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p is_write %d" +qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x" qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64 qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" From 0e71be1932adfad27d564675f89a468fc8b92b1f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:28 +0000 Subject: [PATCH 05/15] qed: add .bdrv_co_write_zeroes() support Zero writes are a dedicated interface for writing regions of zeroes into the image file. If clusters are not yet allocated it is possible to use an efficient metadata representation which keeps the image file compact and does not store individual zero bytes. Implementing this for the QED image format is fairly straightforward. The only issue is that when a zero write touches an existing cluster we have to allocate a bounce buffer and perform a regular write. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qed.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++---- block/qed.h | 1 + 2 files changed, 103 insertions(+), 8 deletions(-) diff --git a/block/qed.c b/block/qed.c index b66dd17a09..a041d31e66 100644 --- a/block/qed.c +++ b/block/qed.c @@ -875,6 +875,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) qemu_iovec_destroy(&acb->cur_qiov); qed_unref_l2_cache_entry(acb->request.l2_table); + /* Free the buffer we may have allocated for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + qemu_vfree(acb->qiov->iov[0].iov_base); + acb->qiov->iov[0].iov_base = NULL; + } + /* Arrange for a bh to invoke the completion function */ acb->bh_ret = ret; acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); @@ -941,9 +947,8 @@ static void qed_aio_write_l1_update(void *opaque, int ret) /** * Update L2 table with new cluster offsets and write them out */ -static void qed_aio_write_l2_update(void *opaque, int ret) +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) { - QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; int index; @@ -959,7 +964,7 @@ static void qed_aio_write_l2_update(void *opaque, int ret) index = qed_l2_index(s, acb->cur_pos); qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, - acb->cur_cluster); + offset); if (need_alloc) { /* Write out the whole new L2 table */ @@ -976,6 +981,12 @@ err: qed_aio_complete(acb, ret); } +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + /** * Flush new data clusters before updating the L2 table * @@ -990,7 +1001,7 @@ static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); - if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) { + if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { qed_aio_complete(acb, -EIO); } } @@ -1019,7 +1030,7 @@ static void qed_aio_write_main(void *opaque, int ret) if (s->bs->backing_hd) { next_fn = qed_aio_write_flush_before_l2_update; } else { - next_fn = qed_aio_write_l2_update; + next_fn = qed_aio_write_l2_update_cb; } } @@ -1081,6 +1092,18 @@ static bool qed_should_set_need_check(BDRVQEDState *s) return !(s->header.features & QED_F_NEED_CHECK); } +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_aio_write_l2_update(acb, 0, 1); +} + /** * Write new data cluster * @@ -1092,6 +1115,7 @@ static bool qed_should_set_need_check(BDRVQEDState *s) static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { BDRVQEDState *s = acb_to_s(acb); + BlockDriverCompletionFunc *cb; /* Cancel timer when the first allocating request comes in */ if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { @@ -1109,14 +1133,26 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) acb->cur_nclusters = qed_bytes_to_clusters(s, qed_offset_into_cluster(s, acb->cur_pos) + len); - acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + if (acb->flags & QED_AIOCB_ZERO) { + /* Skip ahead if the clusters are already zero */ + if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { + qed_aio_next_io(acb, 0); + return; + } + + cb = qed_aio_write_zero_cluster; + } else { + cb = qed_aio_write_prefill; + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + } + if (qed_should_set_need_check(s)) { s->header.features |= QED_F_NEED_CHECK; - qed_write_header(s, qed_aio_write_prefill, acb); + qed_write_header(s, cb, acb); } else { - qed_aio_write_prefill(acb, 0); + cb(acb, 0); } } @@ -1131,6 +1167,16 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) */ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) { + /* Allocate buffer for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + struct iovec *iov = acb->qiov->iov; + + if (!iov->iov_base) { + iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + memset(iov->iov_base, 0, iov->iov_len); + } + } + /* Calculate the I/O vector */ acb->cur_cluster = offset; qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); @@ -1311,6 +1357,53 @@ static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs, return bdrv_aio_flush(bs->file, cb, opaque); } +typedef struct { + Coroutine *co; + int ret; + bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ + QEDWriteZeroesCB *cb = opaque; + + cb->done = true; + cb->ret = ret; + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + BlockDriverAIOCB *blockacb; + QEDWriteZeroesCB cb = { .done = false }; + QEMUIOVector qiov; + struct iovec iov; + + /* Zero writes start without an I/O buffer. If a buffer becomes necessary + * then it will be allocated during request processing. + */ + iov.iov_base = NULL, + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + + qemu_iovec_init_external(&qiov, &iov, 1); + blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, + qed_co_write_zeroes_cb, &cb, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); + if (!blockacb) { + return -EIO; + } + if (!cb.done) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + assert(cb.done); + return cb.ret; +} + static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) { BDRVQEDState *s = bs->opaque; @@ -1470,6 +1563,7 @@ static BlockDriver bdrv_qed = { .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, .bdrv_aio_flush = bdrv_qed_aio_flush, + .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, diff --git a/block/qed.h b/block/qed.h index abed1472ed..62624a1f34 100644 --- a/block/qed.h +++ b/block/qed.h @@ -125,6 +125,7 @@ typedef struct QEDRequest { enum { QED_AIOCB_WRITE = 0x0001, /* read or write? */ + QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ }; typedef struct QEDAIOCB { From 71b58b82dac1e1dc5e08a005a14bbcafecbd9e2a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Feb 2012 13:27:29 +0000 Subject: [PATCH 06/15] qemu-io: add write -z option for bdrv_co_write_zeroes Extend the qemu-io write command with the -z option to call bdrv_co_write_zeroes(). Exposing the zero write interface from qemu-io allows us to write tests that exercise this new block layer interface. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- qemu-io.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index d4a5fcbcbc..0249be4e71 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -218,6 +218,51 @@ static int do_pwrite(char *buf, int64_t offset, int count, int *total) return 1; } +typedef struct { + int64_t offset; + int count; + int *total; + int ret; + bool done; +} CoWriteZeroes; + +static void coroutine_fn co_write_zeroes_entry(void *opaque) +{ + CoWriteZeroes *data = opaque; + + data->ret = bdrv_co_write_zeroes(bs, data->offset / BDRV_SECTOR_SIZE, + data->count / BDRV_SECTOR_SIZE); + data->done = true; + if (data->ret < 0) { + *data->total = data->ret; + return; + } + + *data->total = data->count; +} + +static int do_co_write_zeroes(int64_t offset, int count, int *total) +{ + Coroutine *co; + CoWriteZeroes data = { + .offset = offset, + .count = count, + .total = total, + .done = false, + }; + + co = qemu_coroutine_create(co_write_zeroes_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + if (data.ret < 0) { + return data.ret; + } else { + return 1; + } +} + static int do_load_vmstate(char *buf, int64_t offset, int count, int *total) { *total = bdrv_load_vmstate(bs, (uint8_t *)buf, offset, count); @@ -643,6 +688,7 @@ static void write_help(void) " -P, -- use different pattern to fill file\n" " -C, -- report statistics in a machine parsable format\n" " -q, -- quiet mode, do not show I/O statistics\n" +" -z, -- write zeroes using bdrv_co_write_zeroes\n" "\n"); } @@ -654,7 +700,7 @@ static const cmdinfo_t write_cmd = { .cfunc = write_f, .argmin = 2, .argmax = -1, - .args = "[-abCpq] [-P pattern ] off len", + .args = "[-bCpqz] [-P pattern ] off len", .oneline = "writes a number of bytes at a specified offset", .help = write_help, }; @@ -662,16 +708,16 @@ static const cmdinfo_t write_cmd = { static int write_f(int argc, char **argv) { struct timeval t1, t2; - int Cflag = 0, pflag = 0, qflag = 0, bflag = 0; + int Cflag = 0, pflag = 0, qflag = 0, bflag = 0, Pflag = 0, zflag = 0; int c, cnt; - char *buf; + char *buf = NULL; int64_t offset; int count; /* Some compilers get confused and warn if this is not initialized. */ int total = 0; int pattern = 0xcd; - while ((c = getopt(argc, argv, "bCpP:q")) != EOF) { + while ((c = getopt(argc, argv, "bCpP:qz")) != EOF) { switch (c) { case 'b': bflag = 1; @@ -683,6 +729,7 @@ static int write_f(int argc, char **argv) pflag = 1; break; case 'P': + Pflag = 1; pattern = parse_pattern(optarg); if (pattern < 0) { return 0; @@ -691,6 +738,9 @@ static int write_f(int argc, char **argv) case 'q': qflag = 1; break; + case 'z': + zflag = 1; + break; default: return command_usage(&write_cmd); } @@ -700,8 +750,13 @@ static int write_f(int argc, char **argv) return command_usage(&write_cmd); } - if (bflag && pflag) { - printf("-b and -p cannot be specified at the same time\n"); + if (bflag + pflag + zflag > 1) { + printf("-b, -p, or -z cannot be specified at the same time\n"); + return 0; + } + + if (zflag && Pflag) { + printf("-z and -P cannot be specified at the same time\n"); return 0; } @@ -732,13 +787,17 @@ static int write_f(int argc, char **argv) } } - buf = qemu_io_alloc(count, pattern); + if (!zflag) { + buf = qemu_io_alloc(count, pattern); + } gettimeofday(&t1, NULL); if (pflag) { cnt = do_pwrite(buf, offset, count, &total); } else if (bflag) { cnt = do_save_vmstate(buf, offset, count, &total); + } else if (zflag) { + cnt = do_co_write_zeroes(offset, count, &total); } else { cnt = do_write(buf, offset, count, &total); } @@ -758,7 +817,9 @@ static int write_f(int argc, char **argv) print_report("wrote", &t2, offset, count, total, cnt, Cflag); out: - qemu_io_free(buf); + if (!zflag) { + qemu_io_free(buf); + } return 0; } From f9dadc9855ae1554a98c53a10a3a962859992865 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 26 Jan 2012 09:39:02 +1100 Subject: [PATCH 07/15] iSCSI: add configuration variables for iSCSI This patch adds configuration variables for iSCSI to set initiator-name to use when logging in to the target, which type of header-digest to negotiate with the target and username and password for CHAP authentication. This allows specifying a initiator-name either from the command line -iscsi initiator-name=iqn.2004-01.com.example:test or from a configuration file included with -readconfig [iscsi] initiator-name = iqn.2004-01.com.example:test header-digest = CRC32C|CRC32C-NONE|NONE-CRC32C|NONE user = CHAP username password = CHAP password If you use several different targets, you can also configure this on a per target basis by using a group name: [iscsi "iqn.target.name"] ... The configuration file can be read using -readconfig. Example : qemu-system-i386 -drive file=iscsi://127.0.0.1/iqn.ronnie.test/1 -readconfig iscsi.conf Signed-off-by: Ronnie Sahlberg Signed-off-by: Kevin Wolf --- block/iscsi.c | 139 ++++++++++++++++++++++++++++++++++++++++++++---- qemu-config.c | 27 ++++++++++ qemu-doc.texi | 54 ++++++++++++++++++- qemu-options.hx | 16 ++++-- vl.c | 8 +++ 5 files changed, 229 insertions(+), 15 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 938c568071..bd3ca11b2e 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -455,6 +455,109 @@ iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data, } } +static int parse_chap(struct iscsi_context *iscsi, const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *user = NULL; + const char *password = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return 0; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return 0; + } + } + + user = qemu_opt_get(opts, "user"); + if (!user) { + return 0; + } + + password = qemu_opt_get(opts, "password"); + if (!password) { + error_report("CHAP username specified but no password was given"); + return -1; + } + + if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { + error_report("Failed to set initiator username and password"); + return -1; + } + + return 0; +} + +static void parse_header_digest(struct iscsi_context *iscsi, const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *digest = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return; + } + } + + digest = qemu_opt_get(opts, "header-digest"); + if (!digest) { + return; + } + + if (!strcmp(digest, "CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C); + } else if (!strcmp(digest, "NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE); + } else if (!strcmp(digest, "CRC32C-NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE); + } else if (!strcmp(digest, "NONE-CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + } else { + error_report("Invalid header-digest setting : %s", digest); + } +} + +static char *parse_initiator_name(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *name = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return g_strdup("iqn.2008-11.org.linux-kvm"); + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return g_strdup("iqn.2008-11.org.linux-kvm"); + } + } + + name = qemu_opt_get(opts, "initiator-name"); + if (!name) { + return g_strdup("iqn.2008-11.org.linux-kvm"); + } + + return g_strdup(name); +} + /* * We support iscsi url's on the form * iscsi://[%@][:]// @@ -465,6 +568,7 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) struct iscsi_context *iscsi = NULL; struct iscsi_url *iscsi_url = NULL; struct IscsiTask task; + char *initiator_name = NULL; int ret; if ((BDRV_SECTOR_SIZE % 512) != 0) { @@ -474,16 +578,6 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) return -EINVAL; } - memset(iscsilun, 0, sizeof(IscsiLun)); - - /* Should really append the KVM name after the ':' here */ - iscsi = iscsi_create_context("iqn.2008-11.org.linux-kvm:"); - if (iscsi == NULL) { - error_report("iSCSI: Failed to create iSCSI context."); - ret = -ENOMEM; - goto failed; - } - iscsi_url = iscsi_parse_full_url(iscsi, filename); if (iscsi_url == NULL) { error_report("Failed to parse URL : %s %s", filename, @@ -492,6 +586,17 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) goto failed; } + memset(iscsilun, 0, sizeof(IscsiLun)); + + initiator_name = parse_initiator_name(iscsi_url->target); + + iscsi = iscsi_create_context(initiator_name); + if (iscsi == NULL) { + error_report("iSCSI: Failed to create iSCSI context."); + ret = -ENOMEM; + goto failed; + } + if (iscsi_set_targetname(iscsi, iscsi_url->target)) { error_report("iSCSI: Failed to set target name."); ret = -EINVAL; @@ -507,6 +612,14 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) goto failed; } } + + /* check if we got CHAP username/password via the options */ + if (parse_chap(iscsi, iscsi_url->target) != 0) { + error_report("iSCSI: Failed to set CHAP user/password"); + ret = -EINVAL; + goto failed; + } + if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { error_report("iSCSI: Failed to set session type to normal."); ret = -EINVAL; @@ -515,6 +628,9 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + /* check if we got HEADER_DIGEST via the options */ + parse_header_digest(iscsi, iscsi_url->target); + task.iscsilun = iscsilun; task.status = 0; task.complete = 0; @@ -548,6 +664,9 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) return 0; failed: + if (initiator_name != NULL) { + g_free(initiator_name); + } if (iscsi_url != NULL) { iscsi_destroy_url(iscsi_url); } diff --git a/qemu-config.c b/qemu-config.c index b030205e23..e980fd68d6 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -118,6 +118,32 @@ static QemuOptsList qemu_drive_opts = { }, }; +static QemuOptsList qemu_iscsi_opts = { + .name = "iscsi", + .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head), + .desc = { + { + .name = "user", + .type = QEMU_OPT_STRING, + .help = "username for CHAP authentication to target", + },{ + .name = "password", + .type = QEMU_OPT_STRING, + .help = "password for CHAP authentication to target", + },{ + .name = "header-digest", + .type = QEMU_OPT_STRING, + .help = "HeaderDigest setting. " + "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}", + },{ + .name = "initiator-name", + .type = QEMU_OPT_STRING, + .help = "Initiator iqn name to use when connecting", + }, + { /* end of list */ } + }, +}; + static QemuOptsList qemu_chardev_opts = { .name = "chardev", .implied_opt_name = "backend", @@ -580,6 +606,7 @@ static QemuOptsList *vm_config_groups[32] = { &qemu_option_rom_opts, &qemu_machine_opts, &qemu_boot_opts, + &qemu_iscsi_opts, NULL, }; diff --git a/qemu-doc.texi b/qemu-doc.texi index 11f4166995..83b2ad5237 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -730,6 +730,57 @@ export LIBISCSI_CHAP_PASSWORD= iscsi://// @end example +Various session related parameters can be set via special options, either +in a configuration file provided via '-readconfig' or directly on the +command line. + +@example +Setting a specific initiator name to use when logging in to the target +-iscsi initiator-name=iqn.qemu.test:my-initiator +@end example + +@example +Controlling which type of header digest to negotiate with the target +-iscsi header-digest=CRC32C|CRC32C-NONE|NONE-CRC32C|NONE +@end example + +These can also be set via a configuration file +@example +[iscsi] + user = "CHAP username" + password = "CHAP password" + initiator-name = "iqn.qemu.test:my-initiator" + # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE + header-digest = "CRC32C" +@end example + + +Setting the target name allows different options for different targets +@example +[iscsi "iqn.target.name"] + user = "CHAP username" + password = "CHAP password" + initiator-name = "iqn.qemu.test:my-initiator" + # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE + header-digest = "CRC32C" +@end example + + +Howto use a configuration file to set iSCSI configuration options: +@example +cat >iscsi.conf < Date: Mon, 6 Feb 2012 09:22:30 -0700 Subject: [PATCH 08/15] vpc: Add support for Fixed Disk type The Virtual Hard Disk Image Format Specification allows for three types of hard disk formats, Fixed, Dynamic, and Differencing. Qemu currently only supports Dynamic disks. This patch adds support for the Fixed Disk format. Usage: Example 1: qemu-img create -f vpc -o type=fixed [size] Example 2: qemu-img convert -O vpc -o type=fixed While it is also allowed to specify '-o type=dynamic', the default disk type remains Dynamic and is what is used when the type is left unspecified. Signed-off-by: Charles Arnold Signed-off-by: Kevin Wolf --- block/vpc.c | 301 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 202 insertions(+), 99 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 89a5ee2668..db6b14bdf9 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -161,13 +161,27 @@ static int vpc_open(BlockDriverState *bs, int flags) uint8_t buf[HEADER_SIZE]; uint32_t checksum; int err = -1; + int disk_type = VHD_DYNAMIC; if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) goto fail; footer = (struct vhd_footer*) s->footer_buf; - if (strncmp(footer->creator, "conectix", 8)) - goto fail; + if (strncmp(footer->creator, "conectix", 8)) { + int64_t offset = bdrv_getlength(bs->file); + if (offset < HEADER_SIZE) { + goto fail; + } + /* If a fixed disk, the footer is found only at the end of the file */ + if (bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE) + != HEADER_SIZE) { + goto fail; + } + if (strncmp(footer->creator, "conectix", 8)) { + goto fail; + } + disk_type = VHD_FIXED; + } checksum = be32_to_cpu(footer->checksum); footer->checksum = 0; @@ -186,49 +200,54 @@ static int vpc_open(BlockDriverState *bs, int flags) goto fail; } - if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE) - != HEADER_SIZE) - goto fail; - - dyndisk_header = (struct vhd_dyndisk_header*) buf; - - if (strncmp(dyndisk_header->magic, "cxsparse", 8)) - goto fail; - - - s->block_size = be32_to_cpu(dyndisk_header->block_size); - s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; - - s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); - s->pagetable = g_malloc(s->max_table_entries * 4); - - s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - if (bdrv_pread(bs->file, s->bat_offset, s->pagetable, - s->max_table_entries * 4) != s->max_table_entries * 4) - goto fail; - - s->free_data_block_offset = - (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511; - - for (i = 0; i < s->max_table_entries; i++) { - be32_to_cpus(&s->pagetable[i]); - if (s->pagetable[i] != 0xFFFFFFFF) { - int64_t next = (512 * (int64_t) s->pagetable[i]) + - s->bitmap_size + s->block_size; - - if (next> s->free_data_block_offset) - s->free_data_block_offset = next; + if (disk_type == VHD_DYNAMIC) { + if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, + HEADER_SIZE) != HEADER_SIZE) { + goto fail; } - } - s->last_bitmap_offset = (int64_t) -1; + dyndisk_header = (struct vhd_dyndisk_header *) buf; + + if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { + goto fail; + } + + s->block_size = be32_to_cpu(dyndisk_header->block_size); + s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; + + s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); + s->pagetable = g_malloc(s->max_table_entries * 4); + + s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); + if (bdrv_pread(bs->file, s->bat_offset, s->pagetable, + s->max_table_entries * 4) != s->max_table_entries * 4) { + goto fail; + } + + s->free_data_block_offset = + (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511; + + for (i = 0; i < s->max_table_entries; i++) { + be32_to_cpus(&s->pagetable[i]); + if (s->pagetable[i] != 0xFFFFFFFF) { + int64_t next = (512 * (int64_t) s->pagetable[i]) + + s->bitmap_size + s->block_size; + + if (next > s->free_data_block_offset) { + s->free_data_block_offset = next; + } + } + } + + s->last_bitmap_offset = (int64_t) -1; #ifdef CACHE - s->pageentry_u8 = g_malloc(512); - s->pageentry_u32 = s->pageentry_u8; - s->pageentry_u16 = s->pageentry_u8; - s->last_pagetable = -1; + s->pageentry_u8 = g_malloc(512); + s->pageentry_u32 = s->pageentry_u8; + s->pageentry_u16 = s->pageentry_u8; + s->last_pagetable = -1; #endif + } qemu_co_mutex_init(&s->lock); @@ -395,7 +414,11 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, int ret; int64_t offset; int64_t sectors, sectors_per_block; + struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + if (cpu_to_be32(footer->type) == VHD_FIXED) { + return bdrv_read(bs->file, sector_num, buf, nb_sectors); + } while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 0); @@ -440,7 +463,11 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, int64_t offset; int64_t sectors, sectors_per_block; int ret; + struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + if (cpu_to_be32(footer->type) == VHD_FIXED) { + return bdrv_write(bs->file, sector_num, buf, nb_sectors); + } while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 1); @@ -533,70 +560,14 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int vpc_create(const char *filename, QEMUOptionParameter *options) +static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) { - uint8_t buf[1024]; - struct vhd_footer* footer = (struct vhd_footer*) buf; struct vhd_dyndisk_header* dyndisk_header = (struct vhd_dyndisk_header*) buf; - int fd, i; - uint16_t cyls = 0; - uint8_t heads = 0; - uint8_t secs_per_cyl = 0; size_t block_size, num_bat_entries; - int64_t total_sectors = 0; + int i; int ret = -EIO; - // Read out options - total_sectors = get_option_parameter(options, BLOCK_OPT_SIZE)->value.n / - BDRV_SECTOR_SIZE; - - // Create the file - fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) - return -EIO; - - /* Calculate matching total_size and geometry. Increase the number of - sectors requested until we get enough (or fail). */ - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - if (calculate_geometry(total_sectors + i, - &cyls, &heads, &secs_per_cyl)) { - ret = -EFBIG; - goto fail; - } - } - total_sectors = (int64_t) cyls * heads * secs_per_cyl; - - // Prepare the Hard Disk Footer - memset(buf, 0, 1024); - - memcpy(footer->creator, "conectix", 8); - // TODO Check if "qemu" creator_app is ok for VPC - memcpy(footer->creator_app, "qemu", 4); - memcpy(footer->creator_os, "Wi2k", 4); - - footer->features = be32_to_cpu(0x02); - footer->version = be32_to_cpu(0x00010000); - footer->data_offset = be64_to_cpu(HEADER_SIZE); - footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE); - - // Version of Virtual PC 2007 - footer->major = be16_to_cpu(0x0005); - footer->minor =be16_to_cpu(0x0003); - - footer->orig_size = be64_to_cpu(total_sectors * 512); - footer->size = be64_to_cpu(total_sectors * 512); - - footer->cyls = be16_to_cpu(cyls); - footer->heads = heads; - footer->secs_per_cyl = secs_per_cyl; - - footer->type = be32_to_cpu(VHD_DYNAMIC); - - // TODO uuid is missing - - footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); - // Write the footer (twice: at the beginning and at the end) block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); @@ -624,7 +595,6 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) } } - // Prepare the Dynamic Disk Header memset(buf, 0, 1024); @@ -652,6 +622,132 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) } ret = 0; + fail: + return ret; +} + +static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) +{ + int ret = -EIO; + + /* Add footer to total size */ + total_size += 512; + if (ftruncate(fd, total_size) != 0) { + ret = -errno; + goto fail; + } + if (lseek(fd, -512, SEEK_END) < 0) { + goto fail; + } + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + goto fail; + } + + ret = 0; + + fail: + return ret; +} + +static int vpc_create(const char *filename, QEMUOptionParameter *options) +{ + uint8_t buf[1024]; + struct vhd_footer *footer = (struct vhd_footer *) buf; + QEMUOptionParameter *disk_type_param; + int fd, i; + uint16_t cyls = 0; + uint8_t heads = 0; + uint8_t secs_per_cyl = 0; + int64_t total_sectors; + int64_t total_size; + int disk_type; + int ret = -EIO; + + /* Read out options */ + total_size = get_option_parameter(options, BLOCK_OPT_SIZE)->value.n; + + disk_type_param = get_option_parameter(options, BLOCK_OPT_SUBFMT); + if (disk_type_param && disk_type_param->value.s) { + if (!strcmp(disk_type_param->value.s, "dynamic")) { + disk_type = VHD_DYNAMIC; + } else if (!strcmp(disk_type_param->value.s, "fixed")) { + disk_type = VHD_FIXED; + } else { + return -EINVAL; + } + } else { + disk_type = VHD_DYNAMIC; + } + + /* Create the file */ + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) { + return -EIO; + } + + total_sectors = total_size / BDRV_SECTOR_SIZE; + if (disk_type == VHD_DYNAMIC) { + /* Calculate matching total_size and geometry. Increase the number of + sectors requested until we get enough (or fail). */ + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; + i++) { + if (calculate_geometry(total_sectors + i, + &cyls, &heads, &secs_per_cyl)) { + ret = -EFBIG; + goto fail; + } + } + } else { + if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) { + ret = -EFBIG; + goto fail; + } + } + total_sectors = (int64_t) cyls * heads * secs_per_cyl; + + /* Prepare the Hard Disk Footer */ + memset(buf, 0, 1024); + + memcpy(footer->creator, "conectix", 8); + /* TODO Check if "qemu" creator_app is ok for VPC */ + memcpy(footer->creator_app, "qemu", 4); + memcpy(footer->creator_os, "Wi2k", 4); + + footer->features = be32_to_cpu(0x02); + footer->version = be32_to_cpu(0x00010000); + if (disk_type == VHD_DYNAMIC) { + footer->data_offset = be64_to_cpu(HEADER_SIZE); + } else { + footer->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL); + } + footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE); + + /* Version of Virtual PC 2007 */ + footer->major = be16_to_cpu(0x0005); + footer->minor = be16_to_cpu(0x0003); + if (disk_type == VHD_DYNAMIC) { + footer->orig_size = be64_to_cpu(total_sectors * 512); + footer->size = be64_to_cpu(total_sectors * 512); + } else { + footer->orig_size = be64_to_cpu(total_size); + footer->size = be64_to_cpu(total_size); + } + footer->cyls = be16_to_cpu(cyls); + footer->heads = heads; + footer->secs_per_cyl = secs_per_cyl; + + footer->type = be32_to_cpu(disk_type); + + /* TODO uuid is missing */ + + footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); + + if (disk_type == VHD_DYNAMIC) { + ret = create_dynamic_disk(fd, buf, total_sectors); + } else { + ret = create_fixed_disk(fd, buf, total_size); + } + fail: close(fd); return ret; @@ -675,6 +771,13 @@ static QEMUOptionParameter vpc_create_options[] = { .type = OPT_SIZE, .help = "Virtual disk size" }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = + "Type of virtual hard disk format. Supported formats are " + "{dynamic (default) | fixed} " + }, { NULL } }; From ecd880d9eeb59a0148d4761698448245ec4306e0 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 7 Feb 2012 10:15:47 +0100 Subject: [PATCH 09/15] vpc: Round up image size during fixed image creation The geometry calculation algorithm from the VHD spec rounds the image size down if it doesn't exactly match a geometry. During image conversion, this causes the image to be truncated. For dynamic images, we already have code in place to round up instead, let's do the same for fixed images. Signed-off-by: Kevin Wolf --- block/vpc.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index db6b14bdf9..6b4816f563 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -685,24 +685,21 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) return -EIO; } + /* + * Calculate matching total_size and geometry. Increase the number of + * sectors requested until we get enough (or fail). This ensures that + * qemu-img convert doesn't truncate images, but rather rounds up. + */ total_sectors = total_size / BDRV_SECTOR_SIZE; - if (disk_type == VHD_DYNAMIC) { - /* Calculate matching total_size and geometry. Increase the number of - sectors requested until we get enough (or fail). */ - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; - i++) { - if (calculate_geometry(total_sectors + i, - &cyls, &heads, &secs_per_cyl)) { - ret = -EFBIG; - goto fail; - } - } - } else { - if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) { + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + if (calculate_geometry(total_sectors + i, &cyls, &heads, + &secs_per_cyl)) + { ret = -EFBIG; goto fail; } } + total_sectors = (int64_t) cyls * heads * secs_per_cyl; /* Prepare the Hard Disk Footer */ From e24e49e6194626e4ec9f1aecce6d6a6847320bce Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 2 Feb 2012 12:32:31 +0100 Subject: [PATCH 10/15] qcow2: Update whole header at once In order to switch the backing file, qcow2 issues multiple write requests that only changed a part of the image header. Any failure after the first one would leave the header in an corrupted state. With this patch, the whole header is written at once, so we can't fail in the middle. At the same time, this gives us a reusable functions that updates all fields of the qcow2 header and not only the backing file. Signed-off-by: Kevin Wolf --- block/qcow2.c | 164 ++++++++++++++++++++++++++++++-------------------- block/qcow2.h | 1 + 2 files changed, 100 insertions(+), 65 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index aa32e8d01a..9f8c2de285 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -669,103 +669,137 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) } } -/* - * Updates the variable length parts of the qcow2 header, i.e. the backing file - * name and all extensions. qcow2 was not designed to allow such changes, so if - * we run out of space (we can only use the first cluster) this function may - * fail. - * - * Returns 0 on success, -errno in error cases. - */ -static int qcow2_update_ext_header(BlockDriverState *bs, - const char *backing_file, const char *backing_fmt) +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, + size_t len, size_t buflen) { - size_t backing_file_len = 0; - size_t backing_fmt_len = 0; - BDRVQcowState *s = bs->opaque; - QCowExtension ext_backing_fmt = {0, 0}; - int ret; + QCowExtension *ext_backing_fmt = (QCowExtension*) buf; + size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); - /* Backing file format doesn't make sense without a backing file */ - if (backing_fmt && !backing_file) { - return -EINVAL; - } - - /* Prepare the backing file format extension if needed */ - if (backing_fmt) { - ext_backing_fmt.len = cpu_to_be32(strlen(backing_fmt)); - ext_backing_fmt.magic = cpu_to_be32(QCOW2_EXT_MAGIC_BACKING_FORMAT); - backing_fmt_len = ((sizeof(ext_backing_fmt) - + strlen(backing_fmt) + 7) & ~7); - } - - /* Check if we can fit the new header into the first cluster */ - if (backing_file) { - backing_file_len = strlen(backing_file); - } - - size_t header_size = sizeof(QCowHeader) + backing_file_len - + backing_fmt_len; - - if (header_size > s->cluster_size) { + if (buflen < ext_len) { return -ENOSPC; } - /* Rewrite backing file name and qcow2 extensions */ - size_t ext_size = header_size - sizeof(QCowHeader); - uint8_t buf[ext_size]; - size_t offset = 0; - size_t backing_file_offset = 0; + *ext_backing_fmt = (QCowExtension) { + .magic = cpu_to_be32(magic), + .len = cpu_to_be32(len), + }; + memcpy(buf + sizeof(QCowExtension), s, len); - if (backing_file) { - if (backing_fmt) { - int padding = backing_fmt_len - - (sizeof(ext_backing_fmt) + strlen(backing_fmt)); + return ext_len; +} - memcpy(buf + offset, &ext_backing_fmt, sizeof(ext_backing_fmt)); - offset += sizeof(ext_backing_fmt); +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowHeader *header; + char *buf; + size_t buflen = s->cluster_size; + int ret; + uint64_t total_size; + uint32_t refcount_table_clusters; - memcpy(buf + offset, backing_fmt, strlen(backing_fmt)); - offset += strlen(backing_fmt); + buf = qemu_blockalign(bs, buflen); + memset(buf, 0, s->cluster_size); - memset(buf + offset, 0, padding); - offset += padding; + /* Header structure */ + header = (QCowHeader*) buf; + + if (buflen < sizeof(*header)) { + ret = -ENOSPC; + goto fail; + } + + total_size = bs->total_sectors * BDRV_SECTOR_SIZE; + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + + *header = (QCowHeader) { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(QCOW_VERSION), + .backing_file_offset = 0, + .backing_file_size = 0, + .cluster_bits = cpu_to_be32(s->cluster_bits), + .size = cpu_to_be64(total_size), + .crypt_method = cpu_to_be32(s->crypt_method_header), + .l1_size = cpu_to_be32(s->l1_size), + .l1_table_offset = cpu_to_be64(s->l1_table_offset), + .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), + .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), + .nb_snapshots = cpu_to_be32(s->nb_snapshots), + .snapshots_offset = cpu_to_be64(s->snapshots_offset), + }; + + buf += sizeof(*header); + buflen -= sizeof(*header); + + /* Backing file format header extension */ + if (*bs->backing_format) { + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, + bs->backing_format, strlen(bs->backing_format), + buflen); + if (ret < 0) { + goto fail; } - memcpy(buf + offset, backing_file, backing_file_len); - backing_file_offset = sizeof(QCowHeader) + offset; + buf += ret; + buflen -= ret; } - ret = bdrv_pwrite_sync(bs->file, sizeof(QCowHeader), buf, ext_size); + /* End of header extensions */ + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); if (ret < 0) { goto fail; } - /* Update header fields */ - uint64_t be_backing_file_offset = cpu_to_be64(backing_file_offset); - uint32_t be_backing_file_size = cpu_to_be32(backing_file_len); + buf += ret; + buflen -= ret; - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_offset), - &be_backing_file_offset, sizeof(uint64_t)); - if (ret < 0) { - goto fail; + /* Backing file name */ + if (*bs->backing_file) { + size_t backing_file_len = strlen(bs->backing_file); + + if (buflen < backing_file_len) { + ret = -ENOSPC; + goto fail; + } + + strncpy(buf, bs->backing_file, buflen); + + header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); + header->backing_file_size = cpu_to_be32(backing_file_len); } - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_size), - &be_backing_file_size, sizeof(uint32_t)); + /* Write the new header */ + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); if (ret < 0) { goto fail; } ret = 0; fail: + qemu_vfree(header); return ret; } static int qcow2_change_backing_file(BlockDriverState *bs, const char *backing_file, const char *backing_fmt) { - return qcow2_update_ext_header(bs, backing_file, backing_fmt); + /* Backing file format doesn't make sense without a backing file */ + if (backing_fmt && !backing_file) { + return -EINVAL; + } + + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + + return qcow2_update_header(bs); } static int preallocate(BlockDriverState *bs) diff --git a/block/qcow2.h b/block/qcow2.h index 99e45361f5..aae5f894e1 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -178,6 +178,7 @@ static inline int64_t align_offset(int64_t offset, int n) /* qcow2.c functions */ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); +int qcow2_update_header(BlockDriverState *bs); /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); From 75bab85ca01876f912caf46f5fcb4ca8d9a50584 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 2 Feb 2012 14:52:08 +0100 Subject: [PATCH 11/15] qcow2: Keep unknown header extension when rewriting header If we want header extensions to work as compatible extensions, we can't destroy yet unknown header extensions when rewriting the header (e.g. for changing the backing file). Save all unknown header extensions in a list of blobs and include them in a new header. Signed-off-by: Kevin Wolf --- block/qcow2.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- block/qcow2.h | 8 ++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 9f8c2de285..3692b4523b 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -77,8 +77,10 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uint64_t end_offset) { + BDRVQcowState *s = bs->opaque; QCowExtension ext; uint64_t offset; + int ret; #ifdef DEBUG_EXT printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); @@ -129,8 +131,22 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, break; default: - /* unknown magic -- just skip it */ - offset = ((offset + ext.len + 7) & ~7); + /* unknown magic - save it in case we need to rewrite the header */ + { + Qcow2UnknownHeaderExtension *uext; + + uext = g_malloc0(sizeof(*uext) + ext.len); + uext->magic = ext.magic; + uext->len = ext.len; + QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); + if (ret < 0) { + return ret; + } + + offset = ((offset + ext.len + 7) & ~7); + } break; } } @@ -138,6 +154,16 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, return 0; } +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + Qcow2UnknownHeaderExtension *uext, *next; + + QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { + QLIST_REMOVE(uext, next); + g_free(uext); + } +} static int qcow2_open(BlockDriverState *bs, int flags) { @@ -291,6 +317,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) return ret; fail: + cleanup_unknown_header_ext(bs); qcow2_free_snapshots(bs); qcow2_refcount_close(bs); g_free(s->l1_table); @@ -632,6 +659,7 @@ static void qcow2_close(BlockDriverState *bs) qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); + cleanup_unknown_header_ext(bs); g_free(s->cluster_cache); qemu_vfree(s->cluster_data); qcow2_refcount_close(bs); @@ -705,6 +733,7 @@ int qcow2_update_header(BlockDriverState *bs) int ret; uint64_t total_size; uint32_t refcount_table_clusters; + Qcow2UnknownHeaderExtension *uext; buf = qemu_blockalign(bs, buflen); memset(buf, 0, s->cluster_size); @@ -752,6 +781,17 @@ int qcow2_update_header(BlockDriverState *bs) buflen -= ret; } + /* Keep unknown header extensions */ + QLIST_FOREACH(uext, &s->unknown_header_ext, next) { + ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + /* End of header extensions */ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); if (ret < 0) { diff --git a/block/qcow2.h b/block/qcow2.h index aae5f894e1..fc35838175 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -87,6 +87,13 @@ typedef struct QCowSnapshot { struct Qcow2Cache; typedef struct Qcow2Cache Qcow2Cache; +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + typedef struct BDRVQcowState { int cluster_bits; int cluster_size; @@ -127,6 +134,7 @@ typedef struct BDRVQcowState { QCowSnapshot *snapshots; int flags; + QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; } BDRVQcowState; /* XXX: use std qcow open function ? */ From ea8f978ffef325ab2fd0edcf5f24b6dcec6078f3 Mon Sep 17 00:00:00 2001 From: Dong Xu Wang Date: Tue, 20 Dec 2011 17:03:47 +0800 Subject: [PATCH 12/15] rewrite QEMU_BUILD_BUG_ON On some platforms, __LINE__ will not expand to real number in QEMU_BUILD_BUG_ON, so if using QEMU_BUILD_BUG_ON twice, compiler will report errors. This patch will fix it. BTW, I got error message on RHEL 6.1/gcc 4.4.5. Signed-off-by: Dong Xu Wang Signed-off-by: Kevin Wolf --- compiler.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler.h b/compiler.h index a1c0794947..736e77075a 100644 --- a/compiler.h +++ b/compiler.h @@ -30,8 +30,10 @@ # define QEMU_PACKED __attribute__((packed)) #endif +#define cat(x,y) x ## y +#define cat2(x,y) cat(x,y) #define QEMU_BUILD_BUG_ON(x) \ - typedef char qemu_build_bug_on__##__LINE__[(x)?-1:1]; + typedef char cat2(qemu_build_bug_on__,__LINE__)[(x)?-1:1]; #if defined __GNUC__ # if !QEMU_GNUC_PREREQ(4, 4) From c9b308d20b642c106048f088ccc31f2aa7cf59ba Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 30 Jan 2012 23:29:47 +0100 Subject: [PATCH 13/15] AHCI: Fix port reset race bdrv_aio_cancel() can trigger bdrv_aio_flush() which makes all aio that is currently in flight finish. So what we do is: port reset detect ncq in flight cancel ncq delete ncq sg list at which point we have double freed the sg list. Instead, with this patch we do: port reset detect ncq in flight cancel ncq check if we are really still in flight delete ncq sg list which makes things work and gets rid of the race. Signed-off-by: Alexander Graf Signed-off-by: Kevin Wolf --- hw/ide/ahci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index c87a6ca515..caff7bc9e6 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -560,6 +560,11 @@ static void ahci_reset_port(AHCIState *s, int port) ncq_tfs->aiocb = NULL; } + /* Maybe we just finished the request thanks to bdrv_aio_cancel() */ + if (!ncq_tfs->used) { + continue; + } + qemu_sglist_destroy(&ncq_tfs->sglist); ncq_tfs->used = 0; } From 6d1acda8f16d1f2d0b05cfbf9ce54d05849cb016 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Tue, 31 Jan 2012 02:10:06 +0900 Subject: [PATCH 14/15] sheepdog: fix co_recv coroutine context The co_recv coroutine has two things that will try to enter it: 1. The select(2) read callback on the sheepdog socket. 2. The aio_add_request() blocking operations, including a coroutine mutex. This patch fixes it by setting NULL to co_recv before sending data. In future, we should make the sheepdog driver fully coroutine-based and simplify request handling. Signed-off-by: MORITA Kazutaka Signed-off-by: Kevin Wolf --- block/sheepdog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/sheepdog.c b/block/sheepdog.c index 9416400165..00276f6f46 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -629,6 +629,9 @@ static void coroutine_fn aio_read_response(void *opaque) switch (acb->aiocb_type) { case AIOCB_WRITE_UDATA: + /* this coroutine context is no longer suitable for co_recv + * because we may send data to update vdi objects */ + s->co_recv = NULL; if (!is_data_obj(aio_req->oid)) { break; } From b867672884afc39b6537a8aa6aa2f20a5154bf4f Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 30 Jan 2012 23:29:48 +0100 Subject: [PATCH 15/15] AHCI: Masking of IRQs actually masks them When masking IRQ lines, we should actually mask them out and not declare them active anymore. Once we mask them in again, they are allowed to trigger again. Signed-off-by: Alexander Graf Signed-off-by: Kevin Wolf --- hw/ide/ahci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index caff7bc9e6..f7ef114123 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -146,6 +146,7 @@ static void ahci_check_irq(AHCIState *s) DPRINTF(-1, "check irq %#x\n", s->control_regs.irqstatus); + s->control_regs.irqstatus = 0; for (i = 0; i < s->ports; i++) { AHCIPortRegs *pr = &s->dev[i].port_regs; if (pr->irq_stat & pr->irq_mask) { @@ -216,6 +217,7 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val) break; case PORT_IRQ_STAT: pr->irq_stat &= ~val; + ahci_check_irq(s); break; case PORT_IRQ_MASK: pr->irq_mask = val & 0xfdc000ff;