From ac3a8726644d4783eacf54212d23db01d1d30044 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:43 +0100 Subject: [PATCH 01/53] util: introduce MIN_NON_ZERO at least in block layer we have the case of limits being defined for a BlockDriverState. However, in this context often zero (0) has the special meanining of undefined which means no limit. If two of those limits are combined and the minimum is needed the minimum function should only return zero if both parameters are zero. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- include/qemu/osdep.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 1565404f7e..c0324344d5 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -68,6 +68,12 @@ typedef signed int int_fast16_t; #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #endif +/* Minimum function that returns zero only iff both values are zero. + * Intended for use with unsigned values only. */ +#ifndef MIN_NON_ZERO +#define MIN_NON_ZERO(a, b) (((a) != 0 && (a) < (b)) ? (a) : (b)) +#endif + #ifndef ROUND_UP #define ROUND_UP(n,d) (((n) + (d) - 1) & -(d)) #endif From 2647fab57d5d5e38b36f8dbda367d688045e6a2d Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:44 +0100 Subject: [PATCH 02/53] BlockLimits: introduce max_transfer_length Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- block.c | 4 ++++ include/block/block_int.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/block.c b/block.c index 88f6d9b236..76fcc1d6d3 100644 --- a/block.c +++ b/block.c @@ -519,6 +519,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) return; } bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; } else { bs->bl.opt_mem_alignment = 512; @@ -533,6 +534,9 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.opt_transfer_length = MAX(bs->bl.opt_transfer_length, bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); bs->bl.opt_mem_alignment = MAX(bs->bl.opt_mem_alignment, bs->backing_hd->bl.opt_mem_alignment); diff --git a/include/block/block_int.h b/include/block/block_int.h index 8898c6c51b..a293e92852 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -289,6 +289,9 @@ typedef struct BlockLimits { /* optimal transfer length in sectors */ int opt_transfer_length; + /* maximal transfer length in sectors */ + int max_transfer_length; + /* memory alignment so that no bounce buffer is needed */ size_t opt_mem_alignment; } BlockLimits; From 52f6fa1430209125d1c13fec7d6bbb501aedf322 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:45 +0100 Subject: [PATCH 03/53] block/iscsi: set max_transfer_length Copy the max_xfer_len from the BlockLimits VPD or use the maximum value fitting in the CDB. The helper function sector_limits_lun2qemu is introduced to convert and cap the limits from the VPD to the maximum power of two fitting in an integer; integer is the range for nb_sectors throughout the block layer. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- block/iscsi.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 233f46285c..6b229b1427 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1447,12 +1447,25 @@ static void iscsi_close(BlockDriverState *bs) memset(iscsilun, 0, sizeof(IscsiLun)); } +static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ + return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); +} + static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) { - IscsiLun *iscsilun = bs->opaque; - /* We don't actually refresh here, but just return data queried in * iscsi_open(): iscsi targets don't change their limits. */ + + IscsiLun *iscsilun = bs->opaque; + uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + + if (iscsilun->bl.max_xfer_len) { + max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); + } + + bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + if (iscsilun->lbp.lbpu) { if (iscsilun->bl.max_unmap < 0xffffffff) { bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, From 6c5a42ac344306bb3711140a3267c61276c1567b Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:46 +0100 Subject: [PATCH 04/53] block: avoid creating oversized writes in multiwrite_merge Signed-off-by: Peter Lieven Reviewed-by: Ronnie Sahlberg Reviewed-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- block.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block.c b/block.c index 76fcc1d6d3..41793419d8 100644 --- a/block.c +++ b/block.c @@ -4446,6 +4446,11 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, merge = 0; } + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + if (merge) { size_t size; QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); From 3dab155154b97eadd44342f9c94d1508337a0667 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:47 +0100 Subject: [PATCH 05/53] block/iscsi: use sector_limits_lun2qemu throughout iscsi_refresh_limits As Max pointed out there is a hidden cast from int64_t to int for all limits. So use the newly introduced sector_limits_lun2qemu for all limits received from the target. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Signed-off-by: Stefan Hajnoczi --- block/iscsi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 6b229b1427..a4ba33b28c 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1468,23 +1468,23 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) if (iscsilun->lbp.lbpu) { if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, - iscsilun); + bs->bl.max_discard = + sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); } - bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); + bs->bl.discard_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); } if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, - iscsilun); + bs->bl.max_write_zeroes = + sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); } if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); + bs->bl.write_zeroes_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); } - bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, - iscsilun); + bs->bl.opt_transfer_length = + sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); } /* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in From dc9e716369282ed687821d52cb3170369626f99f Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Oct 2014 10:18:48 +0100 Subject: [PATCH 06/53] block/iscsi: check for oversized requests Cancel oversized requests early. They would generate an iSCSI protocol error anyway; after having transferred possibly a lot of data over the wire. Suggested-By: Max Reitz Signed-off-by: Peter Lieven Signed-off-by: Stefan Hajnoczi --- block/iscsi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index a4ba33b28c..111065d354 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -362,6 +362,12 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, return -EINVAL; } + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); @@ -529,6 +535,12 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, return -EINVAL; } + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int64_t ret; From 7b8bad1b6abde7612940900dd1092dad873f1a11 Mon Sep 17 00:00:00 2001 From: John Snow Date: Wed, 1 Oct 2014 18:55:46 -0400 Subject: [PATCH 07/53] ahci: Correct PIO/D2H FIS responses Currently, the D2H FIS packets AHCI generates simply parrot back the LBA that the guest sent to us in the cmd_fis. However, some commands (like READ NATIVE MAX) modify the LBA registers as a return value, through which the AHCI D2H FIS is the only response mechanism. Thus, the D2H response should use the current register values, not the initial ones. This patch adjusts the LBA and drive select register responses for PIO Setup and D2H FIS response packets. Additionally, the PIO and D2H FIS responses copy too many bytes from the command FIS that it is being generated from. Specifically, byte 11 which is the Features(15:8) field for Register Host to Device FIS packets, is instead reserved for the PIO Setup FIS and should always be 0. Signed-off-by: John Snow Reviewed-by: Paolo Bonzini Tested-by: Michael S. Tsirkin Message-id: 1412204151-18117-2-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi --- hw/ide/ahci.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 70958e3655..03df4623e9 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -600,6 +600,7 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) uint8_t *pio_fis, *cmd_fis; uint64_t tbl_addr; dma_addr_t cmd_len = 0x80; + IDEState *s = &ad->port.ifs[0]; if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; @@ -629,21 +630,21 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) pio_fis[0] = 0x5f; pio_fis[1] = (ad->hba->control_regs.irqstatus ? (1 << 6) : 0); - pio_fis[2] = ad->port.ifs[0].status; - pio_fis[3] = ad->port.ifs[0].error; + pio_fis[2] = s->status; + pio_fis[3] = s->error; - pio_fis[4] = cmd_fis[4]; - pio_fis[5] = cmd_fis[5]; - pio_fis[6] = cmd_fis[6]; - pio_fis[7] = cmd_fis[7]; - pio_fis[8] = cmd_fis[8]; - pio_fis[9] = cmd_fis[9]; - pio_fis[10] = cmd_fis[10]; - pio_fis[11] = cmd_fis[11]; + pio_fis[4] = s->sector; + pio_fis[5] = s->lcyl; + pio_fis[6] = s->hcyl; + pio_fis[7] = s->select; + pio_fis[8] = s->hob_sector; + pio_fis[9] = s->hob_lcyl; + pio_fis[10] = s->hob_hcyl; + pio_fis[11] = 0; pio_fis[12] = cmd_fis[12]; pio_fis[13] = cmd_fis[13]; pio_fis[14] = 0; - pio_fis[15] = ad->port.ifs[0].status; + pio_fis[15] = s->status; pio_fis[16] = len & 255; pio_fis[17] = len >> 8; pio_fis[18] = 0; @@ -670,6 +671,7 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) int i; dma_addr_t cmd_len = 0x80; int cmd_mapped = 0; + IDEState *s = &ad->port.ifs[0]; if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; @@ -687,17 +689,17 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) d2h_fis[0] = 0x34; d2h_fis[1] = (ad->hba->control_regs.irqstatus ? (1 << 6) : 0); - d2h_fis[2] = ad->port.ifs[0].status; - d2h_fis[3] = ad->port.ifs[0].error; + d2h_fis[2] = s->status; + d2h_fis[3] = s->error; - d2h_fis[4] = cmd_fis[4]; - d2h_fis[5] = cmd_fis[5]; - d2h_fis[6] = cmd_fis[6]; - d2h_fis[7] = cmd_fis[7]; - d2h_fis[8] = cmd_fis[8]; - d2h_fis[9] = cmd_fis[9]; - d2h_fis[10] = cmd_fis[10]; - d2h_fis[11] = cmd_fis[11]; + d2h_fis[4] = s->sector; + d2h_fis[5] = s->lcyl; + d2h_fis[6] = s->hcyl; + d2h_fis[7] = s->select; + d2h_fis[8] = s->hob_sector; + d2h_fis[9] = s->hob_lcyl; + d2h_fis[10] = s->hob_hcyl; + d2h_fis[11] = 0; d2h_fis[12] = cmd_fis[12]; d2h_fis[13] = cmd_fis[13]; for (i = 14; i < 20; i++) { From 659142ecf71a0da240ab0ff7cf929ee25c32b9bc Mon Sep 17 00:00:00 2001 From: John Snow Date: Wed, 1 Oct 2014 18:55:47 -0400 Subject: [PATCH 08/53] ahci: Update byte count after DMA completion Currently, DMA read/write operations neglect to update the byte count after a successful transfer like ATAPI DMA read or PIO read/write operations do. We correct this oversight by adding another callback into the IDEDMAOps structure. The commit callback is called whenever we are cleaning up a scatter-gather list. AHCI can register this callback in order to update post- transfer information such as byte count updates. We use this callback in AHCI to consolidate where we delete the SGlist as generated from the PRDT, as well as update the byte count after the transfer is complete. The QEMUSGList structure has an init flag added to it in order to make qemu_sglist_destroy a nop if it is called when there is no sglist, which simplifies cleanup and error paths. This patch fixes several AHCI problems, notably Non-NCQ modes of operation for Windows 7 as well as Hibernate support for Windows 7. Signed-off-by: John Snow Reviewed-by: Paolo Bonzini Tested-by: Michael S. Tsirkin Message-id: 1412204151-18117-3-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi --- hw/ide/ahci.c | 41 +++++++++++++++++++++++++++++++---------- hw/ide/core.c | 11 +++++++---- hw/ide/internal.h | 2 ++ 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 03df4623e9..d80ddd2d0a 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -49,6 +49,9 @@ static int handle_cmd(AHCIState *s,int port,int slot); static void ahci_reset_port(AHCIState *s, int port); static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis); static void ahci_init_d2h(AHCIDevice *ad); +static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write); +static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes); + static uint32_t ahci_port_read(AHCIState *s, int port, int offset) { @@ -1105,16 +1108,12 @@ static void ahci_start_transfer(IDEDMA *dma) } } - /* update number of transferred bytes */ - ad->cur_cmd->status = cpu_to_le32(le32_to_cpu(ad->cur_cmd->status) + size); - out: /* declare that we processed everything */ s->data_ptr = s->data_end; - if (has_sglist) { - qemu_sglist_destroy(&s->sg); - } + /* Update number of transferred bytes, destroy sglist */ + ahci_commit_buf(dma, size); s->end_transfer_func(s); @@ -1135,6 +1134,11 @@ static void ahci_start_dma(IDEDMA *dma, IDEState *s, dma_cb(s, 0); } +/** + * Called in DMA R/W chains to read the PRDT, utilizing ahci_populate_sglist. + * Not currently invoked by PIO R/W chains, + * which invoke ahci_populate_sglist via ahci_start_transfer. + */ static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); @@ -1147,6 +1151,24 @@ static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write) return s->io_buffer_size != 0; } +/** + * Destroys the scatter-gather list, + * and updates the command header with a bytes-read value. + * called explicitly via ahci_dma_rw_buf (ATAPI DMA), + * and ahci_start_transfer (PIO R/W), + * and called via callback from ide_dma_cb for DMA R/W paths. + */ +static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes) +{ + AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); + IDEState *s = &ad->port.ifs[0]; + + tx_bytes += le32_to_cpu(ad->cur_cmd->status); + ad->cur_cmd->status = cpu_to_le32(tx_bytes); + + qemu_sglist_destroy(&s->sg); +} + static int ahci_dma_rw_buf(IDEDMA *dma, int is_write) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); @@ -1164,11 +1186,9 @@ static int ahci_dma_rw_buf(IDEDMA *dma, int is_write) dma_buf_write(p, l, &s->sg); } - /* free sglist that was created in ahci_populate_sglist() */ - qemu_sglist_destroy(&s->sg); + /* free sglist, update byte count */ + ahci_commit_buf(dma, l); - /* update number of transferred bytes */ - ad->cur_cmd->status = cpu_to_le32(le32_to_cpu(ad->cur_cmd->status) + l); s->io_buffer_index += l; s->io_buffer_offset += l; @@ -1211,6 +1231,7 @@ static const IDEDMAOps ahci_dma_ops = { .start_dma = ahci_start_dma, .start_transfer = ahci_start_transfer, .prepare_buf = ahci_dma_prepare_buf, + .commit_buf = ahci_commit_buf, .rw_buf = ahci_dma_rw_buf, .set_unit = ahci_dma_set_unit, .cmd_done = ahci_cmd_done, diff --git a/hw/ide/core.c b/hw/ide/core.c index 44e3d502d4..d316ccf961 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -634,8 +634,11 @@ void ide_sector_read(IDEState *s) ide_sector_read_cb, s); } -static void dma_buf_commit(IDEState *s) +static void dma_buf_commit(IDEState *s, uint32_t tx_bytes) { + if (s->bus->dma->ops->commit_buf) { + s->bus->dma->ops->commit_buf(s->bus->dma, tx_bytes); + } qemu_sglist_destroy(&s->sg); } @@ -650,6 +653,7 @@ void ide_set_inactive(IDEState *s, bool more) void ide_dma_error(IDEState *s) { + dma_buf_commit(s, 0); ide_abort_command(s); ide_set_inactive(s, false); ide_set_irq(s->bus); @@ -665,7 +669,6 @@ static int ide_handle_rw_error(IDEState *s, int error, int op) s->bus->error_status = op; } else if (action == BLOCK_ERROR_ACTION_REPORT) { if (op & IDE_RETRY_DMA) { - dma_buf_commit(s); ide_dma_error(s); } else { ide_rw_error(s); @@ -709,7 +712,8 @@ void ide_dma_cb(void *opaque, int ret) sector_num = ide_get_sector(s); if (n > 0) { - dma_buf_commit(s); + assert(s->io_buffer_size == s->sg.size); + dma_buf_commit(s, s->io_buffer_size); sector_num += n; ide_set_sector(s, sector_num); s->nsector -= n; @@ -740,7 +744,6 @@ void ide_dma_cb(void *opaque, int ret) if ((s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) && !ide_sect_range_ok(s, sector_num, n)) { - dma_buf_commit(s); ide_dma_error(s); return; } diff --git a/hw/ide/internal.h b/hw/ide/internal.h index 68ac610d6f..907493d0f8 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -322,6 +322,7 @@ typedef void EndTransferFunc(IDEState *); typedef void DMAStartFunc(IDEDMA *, IDEState *, BlockCompletionFunc *); typedef void DMAVoidFunc(IDEDMA *); typedef int DMAIntFunc(IDEDMA *, int); +typedef void DMAu32Func(IDEDMA *, uint32_t); typedef void DMAStopFunc(IDEDMA *, bool); typedef void DMARestartFunc(void *, int, RunState); @@ -430,6 +431,7 @@ struct IDEDMAOps { DMAStartFunc *start_dma; DMAVoidFunc *start_transfer; DMAIntFunc *prepare_buf; + DMAu32Func *commit_buf; DMAIntFunc *rw_buf; DMAIntFunc *set_unit; DMAStopFunc *set_inactive; From 54a7f8f38ddf4711ee8bf773b5066337b045a343 Mon Sep 17 00:00:00 2001 From: John Snow Date: Wed, 1 Oct 2014 18:55:51 -0400 Subject: [PATCH 09/53] ahci: Fix SDB FIS Construction The SDB FIS creation was mangled; We were writing the error byte to byte 0, and omitting the SDB FIS magic byte. Though the SDB packet layout states that: byte 0: Must be 0xA1 to indicate SDB FIS. byte 1: Port multiplier select & other flags byte 2: status byte. byte 3: error byte. This patch adds an SDB FIS structure with human-readable names, and ensures that we are filling the structure appropriately. Signed-off-by: John Snow Reviewed-by: Paolo Bonzini Tested-by: Michael S. Tsirkin Message-id: 1412204151-18117-7-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi --- hw/ide/ahci.c | 18 +++++++++--------- hw/ide/ahci.h | 8 ++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index d80ddd2d0a..61dbed1b97 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -570,24 +570,24 @@ static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) AHCIDevice *ad = &s->dev[port]; AHCIPortRegs *pr = &ad->port_regs; IDEState *ide_state; - uint8_t *sdb_fis; + SDBFIS *sdb_fis; if (!s->dev[port].res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; } - sdb_fis = &ad->res_fis[RES_FIS_SDBFIS]; + sdb_fis = (SDBFIS *)&ad->res_fis[RES_FIS_SDBFIS]; ide_state = &ad->port.ifs[0]; - /* clear memory */ - *(uint32_t*)sdb_fis = 0; - - /* write values */ - sdb_fis[0] = ide_state->error; - sdb_fis[2] = ide_state->status & 0x77; + sdb_fis->type = 0xA1; + /* Interrupt pending & Notification bit */ + sdb_fis->flags = (ad->hba->control_regs.irqstatus ? (1 << 6) : 0); + sdb_fis->status = ide_state->status & 0x77; + sdb_fis->error = ide_state->error; + /* update SAct field in SDB_FIS */ s->dev[port].finished |= finished; - *(uint32_t*)(sdb_fis + 4) = cpu_to_le32(ad->finished); + sdb_fis->payload = cpu_to_le32(ad->finished); /* Update shadow registers (except BSY 0x80 and DRQ 0x08) */ pr->tfdata = (ad->port.ifs[0].error << 8) | diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index e8ea34a1df..b12323730b 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -327,6 +327,14 @@ typedef struct NCQFrame { uint8_t reserved10; } QEMU_PACKED NCQFrame; +typedef struct SDBFIS { + uint8_t type; + uint8_t flags; + uint8_t status; + uint8_t error; + uint32_t payload; +} QEMU_PACKED SDBFIS; + void ahci_init(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports); void ahci_uninit(AHCIState *s); From ba2b22888c43fdf36f3ae0553c89013616e9c44a Mon Sep 17 00:00:00 2001 From: Chris Spiegel Date: Mon, 6 Oct 2014 09:33:45 -0700 Subject: [PATCH 10/53] snapshot: Reset err to NULL to avoid double free If an error occurs in bdrv_snapshot_delete_by_id_or_name(), "err" is freed. If "err" is not set to NULL before calling bdrv_snapshot_delete_by_id_or_name() again, it will not be updated on error, and will be freed again. This can be triggered by starting a VM with at least two drives and then attempting to delete a non-existent snapshot. Broken in commit a89d89d. Signed-off-by: Chris Spiegel Reviewed-by: Markus Armbruster Message-id: 1412613225-32676-1-git-send-email-chris.spiegel@cypherpath.com Signed-off-by: Stefan Hajnoczi --- savevm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/savevm.c b/savevm.c index 2d8eb960bb..08ec678ddc 100644 --- a/savevm.c +++ b/savevm.c @@ -1246,7 +1246,7 @@ int load_vmstate(const char *name) void do_delvm(Monitor *mon, const QDict *qdict) { BlockDriverState *bs; - Error *err = NULL; + Error *err; const char *name = qdict_get_str(qdict, "name"); if (!find_vmstate_bs()) { @@ -1257,6 +1257,7 @@ void do_delvm(Monitor *mon, const QDict *qdict) bs = NULL; while ((bs = bdrv_next(bs))) { if (bdrv_can_snapshot(bs)) { + err = NULL; bdrv_snapshot_delete_by_id_or_name(bs, name, &err); if (err) { monitor_printf(mon, From 285030b0de5575289084a50f4534af5b0a48b834 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 8 Oct 2014 13:13:28 +0400 Subject: [PATCH 11/53] iotests: replace fake parallels image with authentic one The image was generated using http://openvz.org/Ploop utility and properly filled with the same content as original one. Signed-off-by: Denis V. Lunev Reviewed-by: Paolo Bonzini Message-id: 1412759610-2257-2-git-send-email-den@openvz.org CC: Jeff Cody CC: Kevin Wolf CC: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/076 | 10 +++++----- tests/qemu-iotests/076.out | 8 ++++---- .../qemu-iotests/sample_images/fake.parallels.bz2 | Bin 141 -> 0 bytes tests/qemu-iotests/sample_images/parallels-v1.bz2 | Bin 0 -> 147 bytes 4 files changed, 9 insertions(+), 9 deletions(-) delete mode 100644 tests/qemu-iotests/sample_images/fake.parallels.bz2 create mode 100644 tests/qemu-iotests/sample_images/parallels-v1.bz2 diff --git a/tests/qemu-iotests/076 b/tests/qemu-iotests/076 index bc47457a85..2bb478cca4 100755 --- a/tests/qemu-iotests/076 +++ b/tests/qemu-iotests/076 @@ -47,26 +47,26 @@ catalog_entries_offset=$((0x20)) nb_sectors_offset=$((0x24)) echo -echo "== Read from a valid (enough) image ==" -_use_sample_img fake.parallels.bz2 +echo "== Read from a valid v1 image ==" +_use_sample_img parallels-v1.bz2 { $QEMU_IO -c "read -P 0x11 0 64k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Negative catalog size ==" -_use_sample_img fake.parallels.bz2 +_use_sample_img parallels-v1.bz2 poke_file "$TEST_IMG" "$catalog_entries_offset" "\xff\xff\xff\xff" { $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Overflow in catalog allocation ==" -_use_sample_img fake.parallels.bz2 +_use_sample_img parallels-v1.bz2 poke_file "$TEST_IMG" "$nb_sectors_offset" "\xff\xff\xff\xff" poke_file "$TEST_IMG" "$catalog_entries_offset" "\x01\x00\x00\x40" { $QEMU_IO -c "read 64M 64M" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Zero sectors per track ==" -_use_sample_img fake.parallels.bz2 +_use_sample_img parallels-v1.bz2 poke_file "$TEST_IMG" "$tracks_offset" "\x00\x00\x00\x00" { $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir diff --git a/tests/qemu-iotests/076.out b/tests/qemu-iotests/076.out index f7745d8b0d..fd26f3ca32 100644 --- a/tests/qemu-iotests/076.out +++ b/tests/qemu-iotests/076.out @@ -1,18 +1,18 @@ QA output created by 076 -== Read from a valid (enough) image == +== Read from a valid v1 image == read 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) == Negative catalog size == -qemu-io: can't open device TEST_DIR/fake.parallels: Catalog too large +qemu-io: can't open device TEST_DIR/parallels-v1: Catalog too large no file open, try 'help open' == Overflow in catalog allocation == -qemu-io: can't open device TEST_DIR/fake.parallels: Catalog too large +qemu-io: can't open device TEST_DIR/parallels-v1: Catalog too large no file open, try 'help open' == Zero sectors per track == -qemu-io: can't open device TEST_DIR/fake.parallels: Invalid image: Zero sectors per track +qemu-io: can't open device TEST_DIR/parallels-v1: Invalid image: Zero sectors per track no file open, try 'help open' *** done diff --git a/tests/qemu-iotests/sample_images/fake.parallels.bz2 b/tests/qemu-iotests/sample_images/fake.parallels.bz2 deleted file mode 100644 index ffb5f13bac31bc9ab6e1ea5c0cfa26786f2c4cc6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 141 zcmV;80CN9AT4*^jL0KkKS*i&LJ^%_Hf6(xNVE_;S2ml2D2!JYJ)&M{N00969FaWp; z000b`1pojBOn|7QnnOSv)YEF7cgIVO0ByGSdk7e?fW`f$x`2Bi3t$bd06owJs09G{ vKo+1B1LXi)0CVe)J@eC^zBuEJbFFJA24D=p8Gt*$AL8yvrwS4kK_LggA5<|C diff --git a/tests/qemu-iotests/sample_images/parallels-v1.bz2 b/tests/qemu-iotests/sample_images/parallels-v1.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..d1ef14205401a8e010d1be68bffeee92ce137d39 GIT binary patch literal 147 zcmZ>Y%CIzaj8qGbbEBnV0~X zZfy+=3|x~mCkJ!L1skp^{Ftks!;qfyS}R^xuR}6WLEksaZ~=P@V<pD?kL6rKj5mT56#x^qEyDl+ literal 0 HcmV?d00001 From 76823c6e79583c926d127c33926aaf39475e5cda Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 8 Oct 2014 13:13:29 +0400 Subject: [PATCH 12/53] iotests: add v2 parallels sample image and simple test for it This is simple test image for the following commit made by me. commit d25d59802021a747812472780d80a0e792078f40 Author: Denis V. Lunev Date: Mon Jul 28 20:23:55 2014 +0400 parallels: 2TB+ parallels images support Signed-off-by: Denis V. Lunev Reviewed-by: Paolo Bonzini Message-id: 1412759610-2257-3-git-send-email-den@openvz.org CC: Jeff Cody CC: Kevin Wolf CC: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/076 | 5 +++++ tests/qemu-iotests/076.out | 4 ++++ tests/qemu-iotests/sample_images/parallels-v2.bz2 | Bin 0 -> 150 bytes 3 files changed, 9 insertions(+) create mode 100644 tests/qemu-iotests/sample_images/parallels-v2.bz2 diff --git a/tests/qemu-iotests/076 b/tests/qemu-iotests/076 index 2bb478cca4..ed2be3581e 100755 --- a/tests/qemu-iotests/076 +++ b/tests/qemu-iotests/076 @@ -70,6 +70,11 @@ _use_sample_img parallels-v1.bz2 poke_file "$TEST_IMG" "$tracks_offset" "\x00\x00\x00\x00" { $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +echo +echo "== Read from a valid v2 image ==" +_use_sample_img parallels-v2.bz2 +{ $QEMU_IO -c "read -P 0x11 0 64k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/076.out b/tests/qemu-iotests/076.out index fd26f3ca32..32ade08565 100644 --- a/tests/qemu-iotests/076.out +++ b/tests/qemu-iotests/076.out @@ -15,4 +15,8 @@ no file open, try 'help open' == Zero sectors per track == qemu-io: can't open device TEST_DIR/parallels-v1: Invalid image: Zero sectors per track no file open, try 'help open' + +== Read from a valid v2 image == +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done diff --git a/tests/qemu-iotests/sample_images/parallels-v2.bz2 b/tests/qemu-iotests/sample_images/parallels-v2.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..fd8614d061172faae50a993ac7acd3173de9aa98 GIT binary patch literal 150 zcmV;H0BQe1T4*^jL0KkKS<`z1ga8U8f6)C%U;t162ml8F2!JYJ)<8f2004jpFaWp= zU;vl^35GBLOaKJHsVaJ=X*QsGnW)758mCWPt$+@EvggMtTP~K8Aeq(bOlAS_fGVI1 zR{#%`0aSoc2hsqlKqv!50aXBg@8a8e-{1Q8zBk4(ssXG4tO2Y6qyhde Date: Wed, 8 Oct 2014 13:13:30 +0400 Subject: [PATCH 13/53] block/parallels: fix access to not initialized memory in catalog_bitmap found by valgrind. Command: ./qemu-img convert -f parallels -O qcow2 1.hds 1.img Invalid read of size 4 at 0x17D0EF: parallels_co_read (parallels.c:357) by 0x11FEE4: bdrv_aio_rw_vector (block.c:4640) by 0x11FFBF: bdrv_aio_readv_em (block.c:4652) by 0x11F55F: bdrv_co_readv_em (block.c:4862) by 0x123428: bdrv_aligned_preadv (block.c:3056) by 0x1239FA: bdrv_co_do_preadv (block.c:3162) by 0x125424: bdrv_rw_co_entry (block.c:2706) by 0x155DD9: coroutine_trampoline (coroutine-ucontext.c:118) by 0x6975B6F: ??? (in /lib/x86_64-linux-gnu/libc-2.19.so) The problem is that s->catalog_bitmap is allocated/filled as gmalloc(s->catalog_size) thus index validity check must be inclusive, i.e. index >= s->catalog_size is invalid. Signed-off-by: Denis V. Lunev Reviewed-by: Paolo Bonzini Message-id: 1412759610-2257-4-git-send-email-den@openvz.org CC: Jeff Cody CC: Kevin Wolf CC: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- block/parallels.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/parallels.c b/block/parallels.c index 2a814f3db4..4f9cd8dd23 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -155,7 +155,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) offset = sector_num % s->tracks; /* not allocated */ - if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0)) + if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0)) return -1; return ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512; From be21788495fdc8251b04dd4bfd0cdce95c49d75b Mon Sep 17 00:00:00 2001 From: Adam Crume Date: Thu, 9 Oct 2014 11:44:32 -0700 Subject: [PATCH 14/53] rbd: Add support for bdrv_invalidate_cache This fixes Ceph issue 2467: ttp://tracker.ceph.com/issues/2467 [Dropped return r in void function as suggested by Josh Durgin . --Stefan] Signed-off-by: Adam Crume Reviewed-by: Josh Durgin Reviewed-by: Stefan Hajnoczi Message-id: 1412880272-3154-1-git-send-email-adamcrume@gmail.com Signed-off-by: Stefan Hajnoczi --- block/rbd.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/rbd.c b/block/rbd.c index 47cab8be94..5b5a64a27a 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -887,6 +887,18 @@ static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, } #endif +#ifdef LIBRBD_SUPPORTS_INVALIDATE +static void qemu_rbd_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + int r = rbd_invalidate_cache(s->image); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to invalidate the cache"); + } +} +#endif + static QemuOptsList qemu_rbd_create_opts = { .name = "rbd-create-opts", .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), @@ -936,6 +948,9 @@ static BlockDriver bdrv_rbd = { .bdrv_snapshot_delete = qemu_rbd_snap_remove, .bdrv_snapshot_list = qemu_rbd_snap_list, .bdrv_snapshot_goto = qemu_rbd_snap_rollback, +#ifdef LIBRBD_SUPPORTS_INVALIDATE + .bdrv_invalidate_cache = qemu_rbd_invalidate_cache, +#endif }; static void bdrv_rbd_init(void) From 573742a5431a99ceaba6968ae269cee247727cce Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 10 Oct 2014 20:33:03 +0100 Subject: [PATCH 15/53] block.c: Fix type of IoOperationType variable in send_qmp_error_event() The local variable 'ac' in send_qmp_error_event() is declared with the wrong type, which causes clang to complain when it is initialized and again when it is used: block.c:3655:20: warning: implicit conversion from enumeration type 'enum IoOperationType' to different enumeration type 'BlockErrorAction' (aka 'enum BlockErrorAction') [-Wenum-conversion] ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; ~ ^~~~~~~~~~~~~~~~~~~~~~ block.c:3655:45: warning: implicit conversion from enumeration type 'enum IoOperationType' to different enumeration type 'BlockErrorAction' (aka 'enum BlockErrorAction') [-Wenum-conversion] ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; ~ ^~~~~~~~~~~~~~~~~~~~~~~ block.c:3656:62: warning: implicit conversion from enumeration type 'BlockErrorAction' (aka 'enum BlockErrorAction') to different enumeration type 'IoOperationType' (aka 'enum IoOperationType') [-Wenum-conversion] qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action, ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^~ Correct the type to IoOperationType, and rename the variable to 'optype' to match its correct type. Signed-off-by: Peter Maydell Reviewed-by: Eric Blake Reviewed-by: Luiz Capitulino Message-id: 1412969583-21045-1-git-send-email-peter.maydell@linaro.org Signed-off-by: Stefan Hajnoczi --- block.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 41793419d8..13a9e51470 100644 --- a/block.c +++ b/block.c @@ -3544,10 +3544,10 @@ static void send_qmp_error_event(BlockDriverState *bs, BlockErrorAction action, bool is_read, int error) { - BlockErrorAction ac; + IoOperationType optype; - ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action, + optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, bdrv_iostatus_is_enabled(bs), error == ENOSPC, strerror(error), &error_abort); From 3432a1929ee18e08787ce35476abd74f2c93a17c Mon Sep 17 00:00:00 2001 From: Zhang Haoyu Date: Tue, 21 Oct 2014 16:38:01 +0800 Subject: [PATCH 16/53] snapshot: add bdrv_drain_all() to bdrv_snapshot_delete() to avoid concurrency problem If there are still pending i/o while deleting snapshot, because deleting snapshot is done in non-coroutine context, and the pending i/o read/write (bdrv_co_do_rw) is done in coroutine context, so it's possible to cause concurrency problem between above two operations. Add bdrv_drain_all() to bdrv_snapshot_delete() to avoid this problem. Signed-off-by: Zhang Haoyu Reviewed-by: Paolo Bonzini Message-id: 201410211637596311287@sangfor.com Signed-off-by: Stefan Hajnoczi --- block/snapshot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/snapshot.c b/block/snapshot.c index 85c52ff455..698e1a1d58 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -236,6 +236,10 @@ int bdrv_snapshot_delete(BlockDriverState *bs, error_setg(errp, "snapshot_id and name are both NULL"); return -EINVAL; } + + /* drain all pending i/o before deleting snapshot */ + bdrv_drain_all(); + if (drv->bdrv_snapshot_delete) { return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); } From f76faeda4bd59f972d09dd9d954297f17c21dd60 Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Sun, 26 Oct 2014 11:05:27 +0000 Subject: [PATCH 17/53] block/curl: Improve type safety of s->timeout. qemu_opt_get_number returns a uint64_t, and curl_easy_setopt expects a long (not an int). There is no warning about the latter type error because curl_easy_setopt uses a varargs argument. Store the timeout (which is a positive number of seconds) as a uint64_t. Check that the number given by the user is reasonable. Zero is permissible (meaning no timeout is enforced by cURL). Cast it to long before calling curl_easy_setopt to fix the type error. Example error message after this change has been applied: $ ./qemu-img create -f qcow2 /tmp/test.qcow2 \ -b 'json: { "file.driver":"https", "file.url":"https://foo/bar", "file.timeout":-1 }' qemu-img: /tmp/test.qcow2: Could not open 'json: { "file.driver":"https", "file.url":"https://foo/bar", "file.timeout":-1 }': timeout parameter is too large or negative: Invalid argument Signed-off-by: Richard W.M. Jones Reviewed-by: Laszlo Ersek Reviewed-by: Gonglei Signed-off-by: Stefan Hajnoczi --- block/curl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/curl.c b/block/curl.c index b4157cc8b3..bbee3ca179 100644 --- a/block/curl.c +++ b/block/curl.c @@ -64,6 +64,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, #define SECTOR_SIZE 512 #define READ_AHEAD_DEFAULT (256 * 1024) #define CURL_TIMEOUT_DEFAULT 5 +#define CURL_TIMEOUT_MAX 10000 #define FIND_RET_NONE 0 #define FIND_RET_OK 1 @@ -112,7 +113,7 @@ typedef struct BDRVCURLState { char *url; size_t readahead_size; bool sslverify; - int timeout; + uint64_t timeout; char *cookie; bool accept_range; AioContext *aio_context; @@ -390,7 +391,7 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) if (s->cookie) { curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie); } - curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, s->timeout); + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout); curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); @@ -546,6 +547,10 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT, CURL_TIMEOUT_DEFAULT); + if (s->timeout > CURL_TIMEOUT_MAX) { + error_setg(errp, "timeout parameter is too large or negative"); + goto out_noclean; + } s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true); From e6d7ec32dd315422a023ed3425fe36df8c274eeb Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 12:57:58 +0200 Subject: [PATCH 18/53] raw-posix: Fix raw_co_get_block_status() after EOF As its comment states, raw_co_get_block_status() should unconditionally return 0 and set *pnum to 0 for after EOF. An assertion after lseek(..., SEEK_HOLE) tried to catch this case by asserting that errno != -ENXIO (which would indicate a position after the EOF); but it should be errno != ENXIO instead. Regardless of that, there should be no such assertion at all. If bdrv_getlength() returned an outdated value and the image has been resized outside of qemu, lseek() will return with errno == ENXIO. Just return that value as an error then. Setting *pnum to 0 and returning 0 should not be done here, as in that case we should update the device length as well. So, from qemu's perspective, the file has not been resized; it's just that there was an error querying sectors beyond a certain point (the actual file size). Additionally, nb_sectors should be clamped against the image end. This was probably not an issue if FIEMAP or SEEK_HOLE/SEEK_DATA worked, but the fallback did not take this case into account. Reported-by: Kevin Wolf Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414148280-17949-2-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 475cf74655..a86b784fc3 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1535,10 +1535,6 @@ static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, *hole = lseek(s->fd, start, SEEK_HOLE); if (*hole == -1) { - /* -ENXIO indicates that sector_num was past the end of the file. - * There is a virtual hole there. */ - assert(errno != -ENXIO); - return -errno; } @@ -1578,6 +1574,7 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int nb_sectors, int *pnum) { off_t start, data = 0, hole = 0; + int64_t total_size; int64_t ret; ret = fd_open(bs); @@ -1586,6 +1583,15 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, } start = sector_num * BDRV_SECTOR_SIZE; + total_size = bdrv_getlength(bs); + if (total_size < 0) { + return total_size; + } else if (start >= total_size) { + *pnum = 0; + return 0; + } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { + nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + } ret = try_seek_hole(bs, start, &data, &hole, pnum); if (ret < 0) { From d7f62751a14d1d34c7d388431a3e403ef1fe28a5 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 12:57:59 +0200 Subject: [PATCH 19/53] raw-posix: raw_co_get_block_status() return value Instead of generating the full return value thrice in try_fiemap(), try_seek_hole() and as a fall-back in raw_co_get_block_status() itself, generate the value only in raw_co_get_block_status(). While at it, also remove the pnum parameter from try_fiemap() and try_seek_hole(). Suggested-by: Kevin Wolf Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414148280-17949-3-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index a86b784fc3..e100ae2046 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1481,12 +1481,12 @@ out: return result; } -static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole, int nb_sectors, int *pnum) +static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int nb_sectors) { #ifdef CONFIG_FIEMAP BDRVRawState *s = bs->opaque; - int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + int ret = 0; struct { struct fiemap fm; struct fiemap_extent fe; @@ -1527,8 +1527,8 @@ static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, #endif } -static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, - off_t *hole, int *pnum) +static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole) { #if defined SEEK_HOLE && defined SEEK_DATA BDRVRawState *s = bs->opaque; @@ -1548,7 +1548,7 @@ static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, } } - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + return 0; #else return -ENOTSUP; #endif @@ -1575,7 +1575,7 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, { off_t start, data = 0, hole = 0; int64_t total_size; - int64_t ret; + int ret; ret = fd_open(bs); if (ret < 0) { @@ -1593,28 +1593,28 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); } - ret = try_seek_hole(bs, start, &data, &hole, pnum); + ret = try_seek_hole(bs, start, &data, &hole); if (ret < 0) { - ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum); + ret = try_fiemap(bs, start, &data, &hole, nb_sectors); if (ret < 0) { /* Assume everything is allocated. */ data = 0; hole = start + nb_sectors * BDRV_SECTOR_SIZE; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + ret = 0; } } + assert(ret >= 0); + if (data <= start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; } else { /* On a hole, compute sectors to the beginning of the next extent. */ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - ret &= ~BDRV_BLOCK_DATA; - ret |= BDRV_BLOCK_ZERO; + return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start; } - - return ret; } static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, From 70a5ff6bdd2f0d4c22e216cab921f9288cb1656e Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 12:58:00 +0200 Subject: [PATCH 20/53] iotests: Add test for external image truncation It should not be happening, but it is possible to truncate an image outside of qemu while qemu is running (or any of the qemu tools using the block layer. raw_co_get_block_status() should not break then. While touching this test, replace the existing "truncate" invocation by "$QEMU_IMG convert -f raw". Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414148280-17949-4-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/102 | 21 +++++++++++++++++++-- tests/qemu-iotests/102.out | 11 +++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/102 b/tests/qemu-iotests/102 index 34b363f78f..161b1974ce 100755 --- a/tests/qemu-iotests/102 +++ b/tests/qemu-iotests/102 @@ -34,9 +34,10 @@ _cleanup() } trap "_cleanup; exit \$status" 0 1 2 3 15 -# get standard environment, filters and checks +# get standard environment, filters and qemu instance handling . ./common.rc . ./common.filter +. ./common.qemu _supported_fmt qcow2 _supported_proto file @@ -53,11 +54,27 @@ _make_test_img $IMG_SIZE $QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io # Remove data cluster from image (first cluster: image header, second: reftable, # third: refblock, fourth: L1 table, fifth: L2 table) -truncate -s $((5 * 64 * 1024)) "$TEST_IMG" +$QEMU_IMG resize -f raw "$TEST_IMG" $((5 * 64 * 1024)) $QEMU_IO -c map "$TEST_IMG" $QEMU_IMG map "$TEST_IMG" +echo +echo '=== Testing map on an image file truncated outside of qemu ===' +echo + +# Same as above, only now we concurrently truncate and map the image +_make_test_img $IMG_SIZE +$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io + +qemu_comm_method=monitor _launch_qemu -drive if=none,file="$TEST_IMG",id=drv0 + +$QEMU_IMG resize -f raw "$TEST_IMG" $((5 * 64 * 1024)) + +_send_qemu_cmd $QEMU_HANDLE 'qemu-io drv0 map' 'allocated' \ + | sed -e 's/^(qemu).*qemu-io drv0 map...$/(qemu) qemu-io drv0 map/' +_send_qemu_cmd $QEMU_HANDLE 'quit' '' + # success, all done echo '*** done' rm -f $seq.full diff --git a/tests/qemu-iotests/102.out b/tests/qemu-iotests/102.out index e0e9cdc541..eecde16ad5 100644 --- a/tests/qemu-iotests/102.out +++ b/tests/qemu-iotests/102.out @@ -5,6 +5,17 @@ QA output created by 102 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 wrote 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image resized. [ 0] 128/ 128 sectors allocated at offset 0 bytes (1) Offset Length Mapped to File + +=== Testing map on an image file truncated outside of qemu === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image resized. +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io drv0 map +[ 0] 128/ 128 sectors allocated at offset 0 bytes (1) *** done From 808c4b6f30d77295292cef8ee38c462957a6b9ca Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:30 +0200 Subject: [PATCH 21/53] qcow2: Allow "full" discard Normally, discarded sectors should read back as zero. However, there are cases in which a sector (or rather cluster) should be discarded as if they were never written in the first place, that is, reading them should fall through to the backing file again. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-2-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2-cluster.c | 27 +++++++++++++++++---------- block/qcow2-snapshot.c | 2 +- block/qcow2.c | 2 +- block/qcow2.h | 2 +- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4d888c785f..8411f5e6aa 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1414,7 +1414,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. */ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters, enum qcow2_discard_type type) + unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1436,23 +1436,30 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); /* - * Make sure that a discarded area reads back as zeroes for v3 images - * (we cannot do it for v2 without actually writing a zero-filled - * buffer). We can skip the operation if the cluster is already marked - * as zero, or if it's unallocated and we don't have a backing file. + * If full_discard is false, make sure that a discarded area reads back + * as zeroes for v3 images (we cannot do it for v2 without actually + * writing a zero-filled buffer). We can skip the operation if the + * cluster is already marked as zero, or if it's unallocated and we + * don't have a backing file. * * TODO We might want to use bdrv_get_block_status(bs) here, but we're * holding s->lock, so that doesn't work today. + * + * If full_discard is true, the sector should not read back as zeroes, + * but rather fall through to the backing file. */ switch (qcow2_get_cluster_type(old_l2_entry)) { case QCOW2_CLUSTER_UNALLOCATED: - if (!bs->backing_hd) { + if (full_discard || !bs->backing_hd) { continue; } break; case QCOW2_CLUSTER_ZERO: - continue; + if (!full_discard) { + continue; + } + break; case QCOW2_CLUSTER_NORMAL: case QCOW2_CLUSTER_COMPRESSED: @@ -1464,7 +1471,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, /* First remove L2 entries */ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - if (s->qcow_version >= 3) { + if (!full_discard && s->qcow_version >= 3) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); } else { l2_table[l2_index + i] = cpu_to_be64(0); @@ -1483,7 +1490,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type) + int nb_sectors, enum qcow2_discard_type type, bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t end_offset; @@ -1506,7 +1513,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters, type); + ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard); if (ret < 0) { goto fail; } diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index f52d7fdd22..5b3903cf22 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), align_offset(sn->vm_state_size, s->cluster_size) >> BDRV_SECTOR_BITS, - QCOW2_DISCARD_NEVER); + QCOW2_DISCARD_NEVER, false); #ifdef DEBUG_ALLOC { diff --git a/block/qcow2.c b/block/qcow2.c index d031515838..d64a4bab3d 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2089,7 +2089,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST); + nb_sectors, QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; } diff --git a/block/qcow2.h b/block/qcow2.h index 577ccd1bf0..886b25b7e0 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -534,7 +534,7 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type); + int nb_sectors, enum qcow2_discard_type type, bool full_discard); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); int qcow2_expand_zero_clusters(BlockDriverState *bs); From 491d27e2af4f6e157c4b29d43269c5cb0d191171 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:31 +0200 Subject: [PATCH 22/53] qcow2: Implement bdrv_make_empty() Implement this function by making all clusters in the image file fall through to the backing file (by using the recently extended discard). Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-3-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/block/qcow2.c b/block/qcow2.c index d64a4bab3d..bf871d58f4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2230,6 +2230,32 @@ fail: return ret; } +static int qcow2_make_empty(BlockDriverState *bs) +{ + int ret = 0; + uint64_t start_sector; + int sector_step = INT_MAX / BDRV_SECTOR_SIZE; + + for (start_sector = 0; start_sector < bs->total_sectors; + start_sector += sector_step) + { + /* As this function is generally used after committing an external + * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the + * default action for this kind of discard is to pass the discard, + * which will ideally result in an actually smaller image file, as + * is probably desired. */ + ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE, + MIN(sector_step, + bs->total_sectors - start_sector), + QCOW2_DISCARD_SNAPSHOT, true); + if (ret < 0) { + break; + } + } + + return ret; +} + static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; @@ -2676,6 +2702,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_co_discard = qcow2_co_discard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, + .bdrv_make_empty = qcow2_make_empty, .bdrv_snapshot_create = qcow2_snapshot_create, .bdrv_snapshot_goto = qcow2_snapshot_goto, From 94054183daffaa41cd77ced9301c01a01027923a Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:32 +0200 Subject: [PATCH 23/53] qcow2: Optimize bdrv_make_empty() bdrv_make_empty() is currently only called if the current image represents an external snapshot that has been committed to its base image; it is therefore unlikely to have internal snapshots. In this case, bdrv_make_empty() can be greatly sped up by emptying the L1 and refcount table (while having the dirty flag set, which only works for compat=1.1) and creating a trivial refcount structure. If there are snapshots or for compat=0.10, fall back to the simple implementation (discard all clusters). [Applied s/clusters/cluster/ typo fix suggested by Eric Blake --Stefan] Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-4-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/blkdebug.c | 2 + block/qcow2.c | 165 +++++++++++++++++++++++++++++++++++++++++- include/block/block.h | 2 + 3 files changed, 168 insertions(+), 1 deletion(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index e046b920fb..862d93b59b 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_PWRITEV] = "pwritev", [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", [BLKDBG_PWRITEV_DONE] = "pwritev_done", + + [BLKDBG_EMPTY_IMAGE_PREPARE] = "empty_image_prepare", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) diff --git a/block/qcow2.c b/block/qcow2.c index bf871d58f4..7ec7830fac 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2230,12 +2230,175 @@ fail: return ret; } +static int make_completely_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, l1_clusters; + int64_t offset; + uint64_t *new_reftable = NULL; + uint64_t rt_entry, l1_size2; + struct { + uint64_t l1_offset; + uint64_t reftable_offset; + uint32_t reftable_clusters; + } QEMU_PACKED l1_ofs_rt_ofs_cls; + + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + ret = qcow2_cache_empty(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* Refcounts will be broken utterly */ + ret = qcow2_mark_dirty(bs); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); + + /* After this call, neither the in-memory nor the on-disk refcount + * information accurately describe the actual references */ + + ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE, + l1_clusters * s->cluster_sectors, 0); + if (ret < 0) { + goto fail_broken_refcounts; + } + memset(s->l1_table, 0, l1_size2); + + BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); + + /* Overwrite enough clusters at the beginning of the sectors to place + * the refcount table, a refcount block and the L1 table in; this may + * overwrite parts of the existing refcount and L1 table, which is not + * an issue because the dirty flag is set, complete data loss is in fact + * desired and partial data loss is consequently fine as well */ + ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE, + (2 + l1_clusters) * s->cluster_size / + BDRV_SECTOR_SIZE, 0); + /* This call (even if it failed overall) may have overwritten on-disk + * refcount structures; in that case, the in-memory refcount information + * will probably differ from the on-disk information which makes the BDS + * unusable */ + if (ret < 0) { + goto fail_broken_refcounts; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + + /* "Create" an empty reftable (one cluster) directly after the image + * header and an empty L1 table three clusters after the image header; + * the cluster between those two will be used as the first refblock */ + cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); + cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); + cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), + &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); + if (ret < 0) { + goto fail_broken_refcounts; + } + + s->l1_table_offset = 3 * s->cluster_size; + + new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); + if (!new_reftable) { + ret = -ENOMEM; + goto fail_broken_refcounts; + } + + s->refcount_table_offset = s->cluster_size; + s->refcount_table_size = s->cluster_size / sizeof(uint64_t); + + g_free(s->refcount_table); + s->refcount_table = new_reftable; + new_reftable = NULL; + + /* Now the in-memory refcount information again corresponds to the on-disk + * information (reftable is empty and no refblocks (the refblock cache is + * empty)); however, this means some clusters (e.g. the image header) are + * referenced, but not refcounted, but the normal qcow2 code assumes that + * the in-memory information is always correct */ + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Enter the first refblock into the reftable */ + rt_entry = cpu_to_be64(2 * s->cluster_size); + ret = bdrv_pwrite_sync(bs->file, s->cluster_size, + &rt_entry, sizeof(rt_entry)); + if (ret < 0) { + goto fail_broken_refcounts; + } + s->refcount_table[0] = 2 * s->cluster_size; + + s->free_cluster_index = 0; + assert(3 + l1_clusters <= s->refcount_block_size); + offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); + if (offset < 0) { + ret = offset; + goto fail_broken_refcounts; + } else if (offset > 0) { + error_report("First cluster in emptied image is in use"); + abort(); + } + + /* Now finally the in-memory information corresponds to the on-disk + * structures and is correct */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + goto fail; + } + + ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size); + if (ret < 0) { + goto fail; + } + + return 0; + +fail_broken_refcounts: + /* The BDS is unusable at this point. If we wanted to make it usable, we + * would have to call qcow2_refcount_close(), qcow2_refcount_init(), + * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() + * again. However, because the functions which could have caused this error + * path to be taken are used by those functions as well, it's very likely + * that that sequence will fail as well. Therefore, just eject the BDS. */ + bs->drv = NULL; + +fail: + g_free(new_reftable); + return ret; +} + static int qcow2_make_empty(BlockDriverState *bs) { - int ret = 0; + BDRVQcowState *s = bs->opaque; uint64_t start_sector; int sector_step = INT_MAX / BDRV_SECTOR_SIZE; + int l1_clusters, ret = 0; + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + + if (s->qcow_version >= 3 && !s->snapshots && + 3 + l1_clusters <= s->refcount_block_size) { + /* The following function only works for qcow2 v3 images (it requires + * the dirty flag) and only as long as there are no snapshots (because + * it completely empties the image). Furthermore, the L1 table and three + * additional clusters (image header, refcount table, one refcount + * block) have to fit inside one refcount block. */ + return make_completely_empty(bs); + } + + /* This fallback code simply discards every active cluster; this is slow, + * but works in all cases */ for (start_sector = 0; start_sector < bs->total_sectors; start_sector += sector_step) { diff --git a/include/block/block.h b/include/block/block.h index 341054dcf7..b1f4385f03 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -498,6 +498,8 @@ typedef enum { BLKDBG_PWRITEV_ZERO, BLKDBG_PWRITEV_DONE, + BLKDBG_EMPTY_IMAGE_PREPARE, + BLKDBG_EVENT_MAX, } BlkDebugEvent; From 345f9e1b04ee18dcb1454dcec49781d5e06ecb60 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:33 +0200 Subject: [PATCH 24/53] blockjob: Introduce block_job_complete_sync() Implement block_job_complete_sync() by doing the exact same thing as block_job_cancel_sync() does, only with calling block_job_complete() instead of block_job_cancel(). Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-5-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- blockjob.c | 39 ++++++++++++++++++++++++++++++++------- include/block/blockjob.h | 15 +++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/blockjob.c b/blockjob.c index ff0908a653..a7d57e3d33 100644 --- a/blockjob.c +++ b/blockjob.c @@ -153,7 +153,7 @@ void block_job_iostatus_reset(BlockJob *job) } } -struct BlockCancelData { +struct BlockFinishData { BlockJob *job; BlockCompletionFunc *cb; void *opaque; @@ -161,19 +161,22 @@ struct BlockCancelData { int ret; }; -static void block_job_cancel_cb(void *opaque, int ret) +static void block_job_finish_cb(void *opaque, int ret) { - struct BlockCancelData *data = opaque; + struct BlockFinishData *data = opaque; data->cancelled = block_job_is_cancelled(data->job); data->ret = ret; data->cb(data->opaque, ret); } -int block_job_cancel_sync(BlockJob *job) +static int block_job_finish_sync(BlockJob *job, + void (*finish)(BlockJob *, Error **errp), + Error **errp) { - struct BlockCancelData data; + struct BlockFinishData data; BlockDriverState *bs = job->bs; + Error *local_err = NULL; assert(bs->job == job); @@ -184,15 +187,37 @@ int block_job_cancel_sync(BlockJob *job) data.cb = job->cb; data.opaque = job->opaque; data.ret = -EINPROGRESS; - job->cb = block_job_cancel_cb; + job->cb = block_job_finish_cb; job->opaque = &data; - block_job_cancel(job); + finish(job, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EBUSY; + } while (data.ret == -EINPROGRESS) { aio_poll(bdrv_get_aio_context(bs), true); } return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret; } +/* A wrapper around block_job_cancel() taking an Error ** parameter so it may be + * used with block_job_finish_sync() without the need for (rather nasty) + * function pointer casts there. */ +static void block_job_cancel_err(BlockJob *job, Error **errp) +{ + block_job_cancel(job); +} + +int block_job_cancel_sync(BlockJob *job) +{ + return block_job_finish_sync(job, &block_job_cancel_err, NULL); +} + +int block_job_complete_sync(BlockJob *job, Error **errp) +{ + return block_job_finish_sync(job, &block_job_complete, errp); +} + void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns) { assert(job->busy); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index acb399f87d..ab11a0f622 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -272,6 +272,21 @@ bool block_job_is_paused(BlockJob *job); */ int block_job_cancel_sync(BlockJob *job); +/** + * block_job_complete_sync: + * @job: The job to be completed. + * @errp: Error object which may be set by block_job_complete(); this is not + * necessarily set on every error, the job return value has to be + * checked as well. + * + * Synchronously complete the job. The completion callback is called before the + * function returns, unless it is NULL (which is permissible when using this + * function). + * + * Returns the return value from the job. + */ +int block_job_complete_sync(BlockJob *job, Error **errp); + /** * block_job_iostatus_reset: * @job: The job whose I/O status should be reset. From ef6dbf1e46ebd1d41ab669df5bba0bbdec6bd374 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:34 +0200 Subject: [PATCH 25/53] blockjob: Add "ready" field When a block job signals readiness, this is currently reported only through QMP. If qemu wants to use block jobs for internal tasks, there needs to be another way to correctly detect when a block job may be completed. For this reason, introduce a bool "ready" which is set when the block job may be completed. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-6-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- blockjob.c | 3 +++ include/block/blockjob.h | 5 +++++ qapi/block-core.json | 4 +++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/blockjob.c b/blockjob.c index a7d57e3d33..448b9ce113 100644 --- a/blockjob.c +++ b/blockjob.c @@ -261,6 +261,7 @@ BlockJobInfo *block_job_query(BlockJob *job) info->offset = job->offset; info->speed = job->speed; info->io_status = job->iostatus; + info->ready = job->ready; return info; } @@ -296,6 +297,8 @@ void block_job_event_completed(BlockJob *job, const char *msg) void block_job_event_ready(BlockJob *job) { + job->ready = true; + qapi_event_send_block_job_ready(job->driver->job_type, bdrv_get_device_name(job->bs), job->len, diff --git a/include/block/blockjob.h b/include/block/blockjob.h index ab11a0f622..9694f130b0 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -91,6 +91,11 @@ struct BlockJob { */ bool busy; + /** + * Set to true when the job is ready to be completed. + */ + bool ready; + /** Status that is published by the query-block-jobs QMP API */ BlockDeviceIoStatus iostatus; diff --git a/qapi/block-core.json b/qapi/block-core.json index 8f7089e054..77a0cfbd82 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -514,12 +514,14 @@ # # @io-status: the status of the job (since 1.3) # +# @ready: true if the job may be completed (since 2.2) +# # Since: 1.1 ## { 'type': 'BlockJobInfo', 'data': {'type': 'str', 'device': 'str', 'len': 'int', 'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int', - 'io-status': 'BlockDeviceIoStatus'} } + 'io-status': 'BlockDeviceIoStatus', 'ready': 'bool'} } ## # @query-block-jobs: From 1d3ba15accf56b08f436d8ca92f5a0abce2d54ee Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:35 +0200 Subject: [PATCH 26/53] iotests: Omit length/offset test in 040 and 041 As of a follow-up patch to this one, the length of a mirror block job will no longer directly depend on the size of the block device; therefore, drop these checks from this test. Instead, just check whether the final offset equals the block job length. As 041 uses the wait_until_completed function from iotests.py, the same applies there as well which in turn affects tests 030, 055 and 056. On the other hand, a block job's length does not have to be related to the length of the image file in the first place, so that check was questionable anyway. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-7-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/040 | 4 +--- tests/qemu-iotests/041 | 5 +---- tests/qemu-iotests/iotests.py | 3 +-- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index f1e16c11c7..2b432ad7a1 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -43,8 +43,7 @@ class ImageCommitTestCase(iotests.QMPTestCase): if event['event'] == 'BLOCK_JOB_COMPLETED': self.assert_qmp(event, 'data/type', 'commit') self.assert_qmp(event, 'data/device', 'drive0') - self.assert_qmp(event, 'data/offset', self.image_len) - self.assert_qmp(event, 'data/len', self.image_len) + self.assert_qmp(event, 'data/offset', event['data']['len']) if need_ready: self.assertTrue(ready, "Expecting BLOCK_JOB_COMPLETED event") completed = True @@ -52,7 +51,6 @@ class ImageCommitTestCase(iotests.QMPTestCase): ready = True self.assert_qmp(event, 'data/type', 'commit') self.assert_qmp(event, 'data/device', 'drive0') - self.assert_qmp(event, 'data/len', self.image_len) self.vm.qmp('block-job-complete', device='drive0') self.assert_no_active_block_jobs() diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 5dbd4ee91b..59a8f733f7 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -52,8 +52,7 @@ class ImageMirroringTestCase(iotests.QMPTestCase): event = self.cancel_and_wait(drive=drive) self.assertEquals(event['event'], 'BLOCK_JOB_COMPLETED') self.assert_qmp(event, 'data/type', 'mirror') - self.assert_qmp(event, 'data/offset', self.image_len) - self.assert_qmp(event, 'data/len', self.image_len) + self.assert_qmp(event, 'data/offset', event['data']['len']) def complete_and_wait(self, drive='drive0', wait_ready=True): '''Complete a block job and wait for it to finish''' @@ -417,7 +416,6 @@ new_state = "1" self.assert_qmp(event, 'data/type', 'mirror') self.assert_qmp(event, 'data/device', 'drive0') self.assert_qmp(event, 'data/error', 'Input/output error') - self.assert_qmp(event, 'data/len', self.image_len) completed = True self.assert_no_active_block_jobs() @@ -568,7 +566,6 @@ new_state = "1" self.assert_qmp(event, 'data/type', 'mirror') self.assert_qmp(event, 'data/device', 'drive0') self.assert_qmp(event, 'data/error', 'Input/output error') - self.assert_qmp(event, 'data/len', self.image_len) completed = True self.assert_no_active_block_jobs() diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 39a4cfcf4d..f57f1548ac 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -267,8 +267,7 @@ class QMPTestCase(unittest.TestCase): self.assert_qmp(event, 'data/device', drive) self.assert_qmp_absent(event, 'data/error') if check_offset: - self.assert_qmp(event, 'data/offset', self.image_len) - self.assert_qmp(event, 'data/len', self.image_len) + self.assert_qmp(event, 'data/offset', event['data']['len']) completed = True self.assert_no_active_block_jobs() From b21c76529d55bf7bb02ac736b312f5f8bf033ea2 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:36 +0200 Subject: [PATCH 27/53] block/mirror: Improve progress report Instead of taking the total length of the block device as the block job's length, use the number of dirty sectors. The progress is now the number of sectors mirrored to the target block device. Note that this may result in the job's length increasing during operation, which is however in fact desirable. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-8-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/mirror.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index e8a43eb39e..2a1acfe26f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -45,6 +45,7 @@ typedef struct MirrorBlockJob { int64_t sector_num; int64_t granularity; size_t buf_size; + int64_t bdev_length; unsigned long *cow_bitmap; BdrvDirtyBitmap *dirty_bitmap; HBitmapIter hbi; @@ -54,6 +55,7 @@ typedef struct MirrorBlockJob { unsigned long *in_flight_bitmap; int in_flight; + int sectors_in_flight; int ret; } MirrorBlockJob; @@ -87,6 +89,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); s->in_flight--; + s->sectors_in_flight -= op->nb_sectors; iov = op->qiov.iov; for (i = 0; i < op->qiov.niov; i++) { MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; @@ -98,8 +101,11 @@ static void mirror_iteration_done(MirrorOp *op, int ret) chunk_num = op->sector_num / sectors_per_chunk; nb_chunks = op->nb_sectors / sectors_per_chunk; bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); - if (s->cow_bitmap && ret >= 0) { - bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + if (ret >= 0) { + if (s->cow_bitmap) { + bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + } + s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; } qemu_iovec_destroy(&op->qiov); @@ -172,7 +178,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) hbitmap_next_sector = s->sector_num; sector_num = s->sector_num; sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - end = s->common.len >> BDRV_SECTOR_BITS; + end = s->bdev_length / BDRV_SECTOR_SIZE; /* Extend the QEMUIOVector to include all adjacent blocks that will * be copied in this operation. @@ -284,6 +290,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Copy the dirty cluster. */ s->in_flight++; + s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, mirror_read_complete, op); @@ -329,11 +336,11 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { - ret = s->common.len; + s->bdev_length = bdrv_getlength(bs); + if (s->bdev_length < 0) { + ret = s->bdev_length; goto immediate_exit; - } else if (s->common.len == 0) { + } else if (s->bdev_length == 0) { /* Report BLOCK_JOB_READY and wait for complete. */ block_job_event_ready(&s->common); s->synced = true; @@ -344,7 +351,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - length = DIV_ROUND_UP(s->common.len, s->granularity); + length = DIV_ROUND_UP(s->bdev_length, s->granularity); s->in_flight_bitmap = bitmap_new(length); /* If we have no backing file yet in the destination, we cannot let @@ -364,7 +371,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - end = s->common.len >> BDRV_SECTOR_BITS; + end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); if (s->buf == NULL) { ret = -ENOMEM; @@ -409,6 +416,12 @@ static void coroutine_fn mirror_run(void *opaque) } cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + /* s->common.offset contains the number of bytes already processed so + * far, cnt is the number of dirty sectors remaining and + * s->sectors_in_flight is the number of sectors currently being + * processed; together those are the current total operation length */ + s->common.len = s->common.offset + + (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that qemu_aio_flush() returns. @@ -445,7 +458,6 @@ static void coroutine_fn mirror_run(void *opaque) * report completion. This way, block-job-cancel will leave * the target in a consistent state. */ - s->common.offset = end * BDRV_SECTOR_SIZE; if (!s->synced) { block_job_event_ready(&s->common); s->synced = true; @@ -474,8 +486,6 @@ static void coroutine_fn mirror_run(void *opaque) ret = 0; trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { - /* Publish progress */ - s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; From d4a3238af567939f4c51bb074af1ce8839e2ce2d Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:37 +0200 Subject: [PATCH 28/53] qemu-img: Implement commit like QMP qemu-img should use QMP commands whenever possible in order to ensure feature completeness of both online and offline image operations. As qemu-img itself has no access to QMP (since this would basically require just everything being linked into qemu-img), imitate QMP's implementation of block-commit by using commit_active_start() and then waiting for the block job to finish. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-9-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/Makefile.objs | 3 +- qemu-img.c | 78 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/block/Makefile.objs b/block/Makefile.objs index 27911b6b88..04b0e43eb1 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o +block-obj-y += null.o mirror.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -23,7 +23,6 @@ block-obj-y += accounting.o common-obj-y += stream.o common-obj-y += commit.o -common-obj-y += mirror.o common-obj-y += backup.o iscsi.o-cflags := $(LIBISCSI_CFLAGS) diff --git a/qemu-img.c b/qemu-img.c index 731502c2ce..494146d6bb 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -31,6 +31,7 @@ #include "sysemu/sysemu.h" #include "sysemu/block-backend.h" #include "block/block_int.h" +#include "block/blockjob.h" #include "block/qapi.h" #include @@ -722,13 +723,43 @@ fail: return ret; } +typedef struct CommonBlockJobCBInfo { + BlockDriverState *bs; + Error **errp; +} CommonBlockJobCBInfo; + +static void common_block_job_cb(void *opaque, int ret) +{ + CommonBlockJobCBInfo *cbi = opaque; + + if (ret < 0) { + error_setg_errno(cbi->errp, -ret, "Block job failed"); + } + + /* Drop this block job's reference */ + bdrv_unref(cbi->bs); +} + +static void run_block_job(BlockJob *job, Error **errp) +{ + AioContext *aio_context = bdrv_get_aio_context(job->bs); + + do { + aio_poll(aio_context, true); + } while (!job->ready); + + block_job_complete_sync(job, errp); +} + static int img_commit(int argc, char **argv) { int c, ret, flags; const char *filename, *fmt, *cache; BlockBackend *blk; - BlockDriverState *bs; + BlockDriverState *bs, *base_bs; bool quiet = false; + Error *local_err = NULL; + CommonBlockJobCBInfo cbi; fmt = NULL; cache = BDRV_DEFAULT_CACHE; @@ -771,29 +802,38 @@ static int img_commit(int argc, char **argv) } bs = blk_bs(blk); - ret = bdrv_commit(bs); - switch(ret) { - case 0: - qprintf(quiet, "Image committed.\n"); - break; - case -ENOENT: - error_report("No disk inserted"); - break; - case -EACCES: - error_report("Image is read-only"); - break; - case -ENOTSUP: - error_report("Image is already committed"); - break; - default: - error_report("Error while committing image"); - break; + /* This is different from QMP, which by default uses the deepest file in the + * backing chain (i.e., the very base); however, the traditional behavior of + * qemu-img commit is using the immediate backing file. */ + base_bs = bs->backing_hd; + if (!base_bs) { + error_setg(&local_err, "Image does not have a backing file"); + goto done; } + cbi = (CommonBlockJobCBInfo){ + .errp = &local_err, + .bs = bs, + }; + + commit_active_start(bs, base_bs, 0, BLOCKDEV_ON_ERROR_REPORT, + common_block_job_cb, &cbi, &local_err); + if (local_err) { + goto done; + } + + run_block_job(bs->job, &local_err); + +done: blk_unref(blk); - if (ret) { + + if (local_err) { + qerror_report_err(local_err); + error_free(local_err); return 1; } + + qprintf(quiet, "Image committed.\n"); return 0; } From 9a86fe489552d47b57c92782471ddfb9e7f702e2 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:38 +0200 Subject: [PATCH 29/53] qemu-img: Empty image after commit After the top image has been committed, it should be emptied unless specified otherwise. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-10-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 34 +++++++++++++++++++++++++++++++--- qemu-img.texi | 6 +++++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 55aec6bded..e2ceadff6d 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -22,9 +22,9 @@ STEXI ETEXI DEF("commit", img_commit, - "commit [-q] [-f fmt] [-t cache] filename") + "commit [-q] [-f fmt] [-t cache] [-d] filename") STEXI -@item commit [-q] [-f @var{fmt}] [-t @var{cache}] @var{filename} +@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] @var{filename} ETEXI DEF("compare", img_compare, diff --git a/qemu-img.c b/qemu-img.c index 494146d6bb..cf8c01c13e 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -757,14 +757,14 @@ static int img_commit(int argc, char **argv) const char *filename, *fmt, *cache; BlockBackend *blk; BlockDriverState *bs, *base_bs; - bool quiet = false; + bool quiet = false, drop = false; Error *local_err = NULL; CommonBlockJobCBInfo cbi; fmt = NULL; cache = BDRV_DEFAULT_CACHE; for(;;) { - c = getopt(argc, argv, "f:ht:q"); + c = getopt(argc, argv, "f:ht:dq"); if (c == -1) { break; } @@ -779,6 +779,9 @@ static int img_commit(int argc, char **argv) case 't': cache = optarg; break; + case 'd': + drop = true; + break; case 'q': quiet = true; break; @@ -789,7 +792,7 @@ static int img_commit(int argc, char **argv) } filename = argv[optind++]; - flags = BDRV_O_RDWR; + flags = BDRV_O_RDWR | BDRV_O_UNMAP; ret = bdrv_parse_cache_flags(cache, &flags); if (ret < 0) { error_report("Invalid cache option: %s", cache); @@ -822,7 +825,32 @@ static int img_commit(int argc, char **argv) goto done; } + /* The block job will swap base_bs and bs (which is not what we really want + * here, but okay) and unref base_bs (after the swap, i.e., the old top + * image). In order to still be able to empty that top image afterwards, + * increment the reference counter here preemptively. */ + if (!drop) { + bdrv_ref(base_bs); + } + run_block_job(bs->job, &local_err); + if (local_err) { + goto unref_backing; + } + + if (!drop && base_bs->drv->bdrv_make_empty) { + ret = base_bs->drv->bdrv_make_empty(base_bs); + if (ret) { + error_setg_errno(&local_err, -ret, "Could not empty %s", + filename); + goto unref_backing; + } + } + +unref_backing: + if (!drop) { + bdrv_unref(base_bs); + } done: blk_unref(blk); diff --git a/qemu-img.texi b/qemu-img.texi index e3ef4a00e9..420bd91177 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -167,7 +167,7 @@ this case. @var{backing_file} will never be modified unless you use the The size can also be specified using the @var{size} option with @code{-o}, it doesn't need to be specified separately in this case. -@item commit [-f @var{fmt}] [-t @var{cache}] @var{filename} +@item commit [-f @var{fmt}] [-t @var{cache}] [-d] @var{filename} Commit the changes recorded in @var{filename} in its base image or backing file. If the backing file is smaller than the snapshot, then the backing file will be @@ -176,6 +176,10 @@ the backing file, the backing file will not be truncated. If you want the backing file to match the size of the smaller snapshot, you can safely truncate it yourself once the commit operation successfully completes. +The image @var{filename} is emptied after the operation has succeeded. If you do +not need @var{filename} afterwards and intend to drop it, you may skip emptying +@var{filename} by specifying the @code{-d} flag. + @item compare [-f @var{fmt}] [-F @var{fmt}] [-T @var{src_cache}] [-p] [-s] [-q] @var{filename1} @var{filename2} Check if two images have the same content. You can compare images with From 687fa1d83013d56f7c7f9c008c956f4c26d8ba5c Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:39 +0200 Subject: [PATCH 30/53] qemu-img: Enable progress output for commit Implement progress output for the commit command by querying the progress of the block job. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-11-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 23 +++++++++++++++++++++-- qemu-img.texi | 2 +- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index e2ceadff6d..fde33d1860 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -22,9 +22,9 @@ STEXI ETEXI DEF("commit", img_commit, - "commit [-q] [-f fmt] [-t cache] [-d] filename") + "commit [-q] [-f fmt] [-t cache] [-d] [-p] filename") STEXI -@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] @var{filename} +@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] [-p] @var{filename} ETEXI DEF("compare", img_compare, diff --git a/qemu-img.c b/qemu-img.c index cf8c01c13e..8fec160c37 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -746,9 +746,14 @@ static void run_block_job(BlockJob *job, Error **errp) do { aio_poll(aio_context, true); + qemu_progress_print((float)job->offset / job->len * 100.f, 0); } while (!job->ready); block_job_complete_sync(job, errp); + + /* A block job may finish instantaneously without publishing any progress, + * so just signal completion here */ + qemu_progress_print(100.f, 0); } static int img_commit(int argc, char **argv) @@ -757,14 +762,14 @@ static int img_commit(int argc, char **argv) const char *filename, *fmt, *cache; BlockBackend *blk; BlockDriverState *bs, *base_bs; - bool quiet = false, drop = false; + bool progress = false, quiet = false, drop = false; Error *local_err = NULL; CommonBlockJobCBInfo cbi; fmt = NULL; cache = BDRV_DEFAULT_CACHE; for(;;) { - c = getopt(argc, argv, "f:ht:dq"); + c = getopt(argc, argv, "f:ht:dpq"); if (c == -1) { break; } @@ -782,11 +787,20 @@ static int img_commit(int argc, char **argv) case 'd': drop = true; break; + case 'p': + progress = true; + break; case 'q': quiet = true; break; } } + + /* Progress is not shown in Quiet mode */ + if (quiet) { + progress = false; + } + if (optind != argc - 1) { error_exit("Expecting one image file name"); } @@ -805,6 +819,9 @@ static int img_commit(int argc, char **argv) } bs = blk_bs(blk); + qemu_progress_init(progress, 1.f); + qemu_progress_print(0.f, 100); + /* This is different from QMP, which by default uses the deepest file in the * backing chain (i.e., the very base); however, the traditional behavior of * qemu-img commit is using the immediate backing file. */ @@ -853,6 +870,8 @@ unref_backing: } done: + qemu_progress_end(); + blk_unref(blk); if (local_err) { diff --git a/qemu-img.texi b/qemu-img.texi index 420bd91177..f82d1b4923 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -167,7 +167,7 @@ this case. @var{backing_file} will never be modified unless you use the The size can also be specified using the @var{size} option with @code{-o}, it doesn't need to be specified separately in this case. -@item commit [-f @var{fmt}] [-t @var{cache}] [-d] @var{filename} +@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] [-p] @var{filename} Commit the changes recorded in @var{filename} in its base image or backing file. If the backing file is smaller than the snapshot, then the backing file will be From 1b22bffd829ee3db4ddcfbd7e66c7a7921123f3e Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:40 +0200 Subject: [PATCH 31/53] qemu-img: Specify backing file for commit Introduce a new parameter for qemu-img commit which may be used to explicitly specify the backing file into which an image should be committed if the backing chain has more than a single layer. [Applied Eric Blake's qemu-img.texi documentation rewording --Stefan] Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-12-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 32 +++++++++++++++++++++++--------- qemu-img.texi | 12 +++++++++++- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index fde33d1860..04c207da8d 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -22,9 +22,9 @@ STEXI ETEXI DEF("commit", img_commit, - "commit [-q] [-f fmt] [-t cache] [-d] [-p] filename") + "commit [-q] [-f fmt] [-t cache] [-b base] [-d] [-p] filename") STEXI -@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] [-p] @var{filename} +@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-b @var{base}] [-d] [-p] @var{filename} ETEXI DEF("compare", img_compare, diff --git a/qemu-img.c b/qemu-img.c index 8fec160c37..bd5a2ae22e 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -759,7 +759,7 @@ static void run_block_job(BlockJob *job, Error **errp) static int img_commit(int argc, char **argv) { int c, ret, flags; - const char *filename, *fmt, *cache; + const char *filename, *fmt, *cache, *base; BlockBackend *blk; BlockDriverState *bs, *base_bs; bool progress = false, quiet = false, drop = false; @@ -768,8 +768,9 @@ static int img_commit(int argc, char **argv) fmt = NULL; cache = BDRV_DEFAULT_CACHE; + base = NULL; for(;;) { - c = getopt(argc, argv, "f:ht:dpq"); + c = getopt(argc, argv, "f:ht:b:dpq"); if (c == -1) { break; } @@ -784,6 +785,11 @@ static int img_commit(int argc, char **argv) case 't': cache = optarg; break; + case 'b': + base = optarg; + /* -b implies -d */ + drop = true; + break; case 'd': drop = true; break; @@ -822,13 +828,21 @@ static int img_commit(int argc, char **argv) qemu_progress_init(progress, 1.f); qemu_progress_print(0.f, 100); - /* This is different from QMP, which by default uses the deepest file in the - * backing chain (i.e., the very base); however, the traditional behavior of - * qemu-img commit is using the immediate backing file. */ - base_bs = bs->backing_hd; - if (!base_bs) { - error_setg(&local_err, "Image does not have a backing file"); - goto done; + if (base) { + base_bs = bdrv_find_backing_image(bs, base); + if (!base_bs) { + error_set(&local_err, QERR_BASE_NOT_FOUND, base); + goto done; + } + } else { + /* This is different from QMP, which by default uses the deepest file in + * the backing chain (i.e., the very base); however, the traditional + * behavior of qemu-img commit is using the immediate backing file. */ + base_bs = bs->backing_hd; + if (!base_bs) { + error_setg(&local_err, "Image does not have a backing file"); + goto done; + } } cbi = (CommonBlockJobCBInfo){ diff --git a/qemu-img.texi b/qemu-img.texi index f82d1b4923..078851adb6 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -167,7 +167,7 @@ this case. @var{backing_file} will never be modified unless you use the The size can also be specified using the @var{size} option with @code{-o}, it doesn't need to be specified separately in this case. -@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-d] [-p] @var{filename} +@item commit [-q] [-f @var{fmt}] [-t @var{cache}] [-b @var{base}] [-d] [-p] @var{filename} Commit the changes recorded in @var{filename} in its base image or backing file. If the backing file is smaller than the snapshot, then the backing file will be @@ -180,6 +180,16 @@ The image @var{filename} is emptied after the operation has succeeded. If you do not need @var{filename} afterwards and intend to drop it, you may skip emptying @var{filename} by specifying the @code{-d} flag. +If the backing chain of the given image file @var{filename} has more than one +layer, the backing file into which the changes will be committed may be +specified as @var{base} (which has to be part of @var{filename}'s backing +chain). If @var{base} is not specified, the immediate backing file of the top +image (which is @var{filename}) will be used. For reasons of consistency, +explicitly specifying @var{base} will always imply @code{-d} (since emptying an +image after committing to an indirect backing file would lead to different data +being read from the image due to content in the intermediate backing chain +overruling the commit target). + @item compare [-f @var{fmt}] [-F @var{fmt}] [-T @var{src_cache}] [-p] [-s] [-q] @var{filename1} @var{filename2} Check if two images have the same content. You can compare images with From f67ac71edb42f764004a3d4564d2a9f3cdfe59bc Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:41 +0200 Subject: [PATCH 32/53] iotests: Add _filter_qemu_img_map As different image formats most probably map guest addresses to different host addresses, add a filter to filter the host addresses out; also, the image filename should be filtered. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-13-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/common.filter | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter index f69cb6b916..3acdb307f4 100644 --- a/tests/qemu-iotests/common.filter +++ b/tests/qemu-iotests/common.filter @@ -213,5 +213,12 @@ _filter_img_info() -e "s/archipelago:a/TEST_DIR\//g" } +# filter out offsets and file names from qemu-img map +_filter_qemu_img_map() +{ + sed -e 's/\([0-9a-fx]* *[0-9a-fx]* *\)[0-9a-fx]* */\1/g' \ + -e 's/Mapped to *//' | _filter_testdir | _filter_imgfmt +} + # make sure this script returns success /bin/true From e6ea23126c6d4350c7eb034f42ef27460190ef21 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:42 +0200 Subject: [PATCH 33/53] iotests: Add test for backing-chain commits Add a test for qemu-img commit on backing chains with more than two images. This test also checks whether the top image is emptied (unless this is prevented by specifying either -d or -b) and does therefore not work for qed and vmdk which requires it to be separate from 020. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Kevin Wolf Message-id: 1414159063-25977-14-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/097 | 122 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/097.out | 119 ++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 242 insertions(+) create mode 100755 tests/qemu-iotests/097 create mode 100644 tests/qemu-iotests/097.out diff --git a/tests/qemu-iotests/097 b/tests/qemu-iotests/097 new file mode 100755 index 0000000000..c7a613b7ee --- /dev/null +++ b/tests/qemu-iotests/097 @@ -0,0 +1,122 @@ +#!/bin/bash +# +# Commit changes into backing chains and empty the top image if the +# backing image is not explicitly specified +# +# Copyright (C) 2014 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img + _rm_test_img "$TEST_IMG.itmd" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.pattern + +# Any format supporting backing files and bdrv_make_empty +_supported_fmt qcow qcow2 +_supported_proto file +_supported_os Linux + + +# Four passes: +# 0: Two-layer backing chain, commit to upper backing file (implicitly) +# (in this case, the top image will be emptied) +# 1: Two-layer backing chain, commit to upper backing file (explicitly) +# (in this case, the top image will implicitly stay unchanged) +# 2: Two-layer backing chain, commit to upper backing file (implicitly with -d) +# (in this case, the top image will explicitly stay unchanged) +# 3: Two-layer backing chain, commit to lower backing file +# (in this case, the top image will implicitly stay unchanged) +# +# 020 already tests committing, so this only tests whether image chains are +# working properly and that all images above the base are emptied; therefore, +# no complicated patterns are necessary +for i in 0 1 2 3; do + +echo +echo "=== Test pass $i ===" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img 64M +TEST_IMG="$TEST_IMG.itmd" _make_test_img -b "$TEST_IMG.base" 64M +_make_test_img -b "$TEST_IMG.itmd" 64M + +$QEMU_IO -c 'write -P 1 0 192k' "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c 'write -P 2 64k 128k' "$TEST_IMG.itmd" | _filter_qemu_io +$QEMU_IO -c 'write -P 3 128k 64k' "$TEST_IMG" | _filter_qemu_io + +if [ $i -lt 3 ]; then + if [ $i == 0 ]; then + # -b "$TEST_IMG.itmd" should be the default (that is, committing to the + # first backing file in the chain) + $QEMU_IMG commit "$TEST_IMG" + elif [ $i == 1 ]; then + # explicitly specify the commit target (this should imply -d) + $QEMU_IMG commit -b "$TEST_IMG.itmd" "$TEST_IMG" + else + # do not explicitly specify the commit target, but use -d to leave the + # top image unchanged + $QEMU_IMG commit -d "$TEST_IMG" + fi + + # Bottom should be unchanged + $QEMU_IO -c 'read -P 1 0 192k' "$TEST_IMG.base" | _filter_qemu_io + + # Intermediate should contain changes from top + $QEMU_IO -c 'read -P 1 0 64k' "$TEST_IMG.itmd" | _filter_qemu_io + $QEMU_IO -c 'read -P 2 64k 64k' "$TEST_IMG.itmd" | _filter_qemu_io + $QEMU_IO -c 'read -P 3 128k 64k' "$TEST_IMG.itmd" | _filter_qemu_io + + # And in pass 0, the top image should be empty, whereas in both other passes + # it should be unchanged (which is both checked by qemu-img map) +else + $QEMU_IMG commit -b "$TEST_IMG.base" "$TEST_IMG" + + # Bottom should contain all changes + $QEMU_IO -c 'read -P 1 0 64k' "$TEST_IMG.base" | _filter_qemu_io + $QEMU_IO -c 'read -P 2 64k 64k' "$TEST_IMG.base" | _filter_qemu_io + $QEMU_IO -c 'read -P 3 128k 64k' "$TEST_IMG.base" | _filter_qemu_io + + # Both top and intermediate should be unchanged +fi + +$QEMU_IMG map "$TEST_IMG.base" | _filter_qemu_img_map +$QEMU_IMG map "$TEST_IMG.itmd" | _filter_qemu_img_map +$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map + +done + + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/097.out b/tests/qemu-iotests/097.out new file mode 100644 index 0000000000..1cb7764fe8 --- /dev/null +++ b/tests/qemu-iotests/097.out @@ -0,0 +1,119 @@ +QA output created by 097 + +=== Test pass 0 === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT.itmd', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.itmd' +wrote 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 131072/131072 bytes at offset 65536 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image committed. +read 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x30000 TEST_DIR/t.IMGFMT.base +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x20000 TEST_DIR/t.IMGFMT.itmd +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x20000 TEST_DIR/t.IMGFMT.itmd + +=== Test pass 1 === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT.itmd', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.itmd' +wrote 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 131072/131072 bytes at offset 65536 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image committed. +read 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x30000 TEST_DIR/t.IMGFMT.base +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x20000 TEST_DIR/t.IMGFMT.itmd +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x10000 TEST_DIR/t.IMGFMT.itmd +0x20000 0x10000 TEST_DIR/t.IMGFMT + +=== Test pass 2 === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT.itmd', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.itmd' +wrote 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 131072/131072 bytes at offset 65536 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image committed. +read 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x30000 TEST_DIR/t.IMGFMT.base +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x20000 TEST_DIR/t.IMGFMT.itmd +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x10000 TEST_DIR/t.IMGFMT.itmd +0x20000 0x10000 TEST_DIR/t.IMGFMT + +=== Test pass 3 === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT.itmd', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.itmd' +wrote 196608/196608 bytes at offset 0 +192 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 131072/131072 bytes at offset 65536 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Image committed. +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 131072 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x30000 TEST_DIR/t.IMGFMT.base +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x20000 TEST_DIR/t.IMGFMT.itmd +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT.base +0x10000 0x10000 TEST_DIR/t.IMGFMT.itmd +0x20000 0x10000 TEST_DIR/t.IMGFMT +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 9bbd5d3a17..f1c2d87b27 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -100,6 +100,7 @@ 091 rw auto quick 092 rw auto quick 095 rw auto quick +097 rw auto backing 099 rw auto quick 100 rw auto quick 101 rw auto quick From 7d90030196d8e08fb13ff9c081fb6343d7bcb8c5 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Oct 2014 15:57:43 +0200 Subject: [PATCH 34/53] iotests: Add test for qcow2's bdrv_make_empty Add a test for qcow2's fast bdrv_make_empty implementation on images without internal snapshots. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Message-id: 1414159063-25977-15-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/098 | 82 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/098.out | 52 ++++++++++++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 135 insertions(+) create mode 100755 tests/qemu-iotests/098 create mode 100644 tests/qemu-iotests/098.out diff --git a/tests/qemu-iotests/098 b/tests/qemu-iotests/098 new file mode 100755 index 0000000000..e2230ad60c --- /dev/null +++ b/tests/qemu-iotests/098 @@ -0,0 +1,82 @@ +#!/bin/bash +# +# Test qcow2's bdrv_make_empty for images without internal snapshots +# +# Copyright (C) 2014 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img + rm -f "$TEST_DIR/blkdebug.conf" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.pattern + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +IMGOPTS="compat=1.1" + +for event in l1_update empty_image_prepare reftable_update refblock_alloc; do + +echo +echo "=== $event ===" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img 64M +_make_test_img -b "$TEST_IMG.base" 64M + +# Some data that can be leaked when emptying the top image +$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io + +cat > "$TEST_DIR/blkdebug.conf" <&1 \ + | _filter_testdir | _filter_imgfmt + +# There may be errors, but they should be fixed by opening the image +$QEMU_IO -c close "$TEST_IMG" + +_check_test_img + +done + + +rm -f "$TEST_DIR/blkdebug.conf" + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/098.out b/tests/qemu-iotests/098.out new file mode 100644 index 0000000000..586dfc809f --- /dev/null +++ b/tests/qemu-iotests/098.out @@ -0,0 +1,52 @@ +QA output created by 098 + +=== l1_update === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: Could not empty blkdebug:TEST_DIR/blkdebug.conf:TEST_DIR/t.IMGFMT: Input/output error +No errors were found on the image. + +=== empty_image_prepare === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: Could not empty blkdebug:TEST_DIR/blkdebug.conf:TEST_DIR/t.IMGFMT: Input/output error +Leaked cluster 4 refcount=1 reference=0 +Leaked cluster 5 refcount=1 reference=0 +Repairing cluster 4 refcount=1 reference=0 +Repairing cluster 5 refcount=1 reference=0 +No errors were found on the image. + +=== reftable_update === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: Could not empty blkdebug:TEST_DIR/blkdebug.conf:TEST_DIR/t.IMGFMT: Input/output error +ERROR cluster 0 refcount=0 reference=1 +ERROR cluster 1 refcount=0 reference=1 +ERROR cluster 3 refcount=0 reference=1 +Rebuilding refcount structure +Repairing cluster 1 refcount=1 reference=0 +No errors were found on the image. + +=== refblock_alloc === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: Could not empty blkdebug:TEST_DIR/blkdebug.conf:TEST_DIR/t.IMGFMT: Input/output error +ERROR cluster 0 refcount=0 reference=1 +ERROR cluster 1 refcount=0 reference=1 +ERROR cluster 3 refcount=0 reference=1 +Rebuilding refcount structure +Repairing cluster 1 refcount=1 reference=0 +No errors were found on the image. +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index f1c2d87b27..281c889921 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -101,6 +101,7 @@ 092 rw auto quick 095 rw auto quick 097 rw auto backing +098 rw auto backing quick 099 rw auto quick 100 rw auto quick 101 rw auto quick From 9ea92c210675b0d2b0ff3ccd2f7a360d1f6c5129 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Sat, 25 Oct 2014 17:05:37 +0200 Subject: [PATCH 35/53] block: qemu-iotest 107 supports NFS As discussed during review a follow up for Max's fix. Signed-off-by: Peter Lieven Reviewed-by: Max Reitz Message-id: 1414249537-29257-1-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/107 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/107 b/tests/qemu-iotests/107 index cad1cf98a0..9862030469 100755 --- a/tests/qemu-iotests/107 +++ b/tests/qemu-iotests/107 @@ -39,7 +39,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 . ./common.filter _supported_fmt qcow2 -_supported_proto file +_supported_proto file nfs _supported_os Linux From 77485434206bbbfbb7f6a446866f6a327b062d5e Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:50 +0100 Subject: [PATCH 36/53] block: Add status callback to bdrv_amend_options() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depending on the changed options and the image format, bdrv_amend_options() may take a significant amount of time. In these cases, a way to be informed about the operation's status is desirable. Since the operation is rather complex and may fundamentally change the image, implementing it as AIO or a coroutine does not seem feasible. On the other hand, implementing it as a block job would be significantly more difficult than a simple callback and would not add benefits other than progress report to the amending operation, because it should not actually be run as a block job at all. A callback may not be very pretty, but it's very easy to implement and perfectly fits its purpose here. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Message-id: 1414404776-4919-2-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block.c | 5 +++-- block/qcow2.c | 3 ++- include/block/block.h | 8 +++++++- include/block/block_int.h | 3 ++- qemu-img.c | 2 +- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index 13a9e51470..c5ff56019b 100644 --- a/block.c +++ b/block.c @@ -5759,12 +5759,13 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs, notifier_with_return_list_add(&bs->before_write_notifiers, notifier); } -int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts) +int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb) { if (!bs->drv->bdrv_amend_options) { return -ENOTSUP; } - return bs->drv->bdrv_amend_options(bs, opts); + return bs->drv->bdrv_amend_options(bs, opts, status_cb); } /* This function will be called by the bdrv_recurse_is_first_non_filter method diff --git a/block/qcow2.c b/block/qcow2.c index 7ec7830fac..02427936c9 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2613,7 +2613,8 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version) return 0; } -static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts) +static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; int old_version = s->qcow_version, new_version = old_version; diff --git a/include/block/block.h b/include/block/block.h index b1f4385f03..5d13282fa9 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -268,7 +268,13 @@ typedef enum { int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); -int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts); +/* The units of offset and total_work_size may be chosen arbitrarily by the + * block driver; total_work_size may change during the course of the amendment + * operation */ +typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset, + int64_t total_work_size); +int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb); /* external snapshots */ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, diff --git a/include/block/block_int.h b/include/block/block_int.h index a293e92852..a1c17b9578 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -232,7 +232,8 @@ struct BlockDriver { int (*bdrv_check)(BlockDriverState* bs, BdrvCheckResult *result, BdrvCheckMode fix); - int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts); + int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb); void (*bdrv_debug_event)(BlockDriverState *bs, BlkDebugEvent event); diff --git a/qemu-img.c b/qemu-img.c index bd5a2ae22e..a095d42d30 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2968,7 +2968,7 @@ static int img_amend(int argc, char **argv) goto out; } - ret = bdrv_amend_options(bs, opts); + ret = bdrv_amend_options(bs, opts, NULL); if (ret < 0) { error_report("Error while amending options: %s", strerror(-ret)); goto out; From 76a3a34dcefbaac3103148e9c3437749b0732cfe Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:51 +0100 Subject: [PATCH 37/53] qemu-img: Add progress output for amend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that bdrv_amend_options() supports a status callback, use it to display a progress report. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Message-id: 1414404776-4919-3-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 25 ++++++++++++++++++++++--- qemu-img.texi | 2 +- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 04c207da8d..95677745f9 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -70,8 +70,8 @@ STEXI ETEXI DEF("amend", img_amend, - "amend [-q] [-f fmt] [-t cache] -o options filename") + "amend [-p] [-q] [-f fmt] [-t cache] -o options filename") STEXI -@item amend [-q] [-f @var{fmt}] [-t @var{cache}] -o @var{options} @var{filename} +@item amend [-p] [-q] [-f @var{fmt}] [-t @var{cache}] -o @var{options} @var{filename} @end table ETEXI diff --git a/qemu-img.c b/qemu-img.c index a095d42d30..c7b394a7f9 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2871,6 +2871,12 @@ out: return 0; } +static void amend_status_cb(BlockDriverState *bs, + int64_t offset, int64_t total_work_size) +{ + qemu_progress_print(100.f * offset / total_work_size, 0); +} + static int img_amend(int argc, char **argv) { int c, ret = 0; @@ -2879,13 +2885,13 @@ static int img_amend(int argc, char **argv) QemuOpts *opts = NULL; const char *fmt = NULL, *filename, *cache; int flags; - bool quiet = false; + bool quiet = false, progress = false; BlockBackend *blk = NULL; BlockDriverState *bs = NULL; cache = BDRV_DEFAULT_CACHE; for (;;) { - c = getopt(argc, argv, "ho:f:t:q"); + c = getopt(argc, argv, "ho:f:t:pq"); if (c == -1) { break; } @@ -2915,6 +2921,9 @@ static int img_amend(int argc, char **argv) case 't': cache = optarg; break; + case 'p': + progress = true; + break; case 'q': quiet = true; break; @@ -2925,6 +2934,11 @@ static int img_amend(int argc, char **argv) error_exit("Must specify options (-o)"); } + if (quiet) { + progress = false; + } + qemu_progress_init(progress, 1.0); + filename = (optind == argc - 1) ? argv[argc - 1] : NULL; if (fmt && has_help_option(options)) { /* If a format is explicitly specified (and possibly no filename is @@ -2968,13 +2982,18 @@ static int img_amend(int argc, char **argv) goto out; } - ret = bdrv_amend_options(bs, opts, NULL); + /* In case the driver does not call amend_status_cb() */ + qemu_progress_print(0.f, 0); + ret = bdrv_amend_options(bs, opts, &amend_status_cb); + qemu_progress_print(100.f, 0); if (ret < 0) { error_report("Error while amending options: %s", strerror(-ret)); goto out; } out: + qemu_progress_end(); + blk_unref(blk); qemu_opts_del(opts); qemu_opts_free(create_opts); diff --git a/qemu-img.texi b/qemu-img.texi index 078851adb6..0a1ab35989 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -412,7 +412,7 @@ After using this command to grow a disk image, you must use file system and partitioning tools inside the VM to actually begin using the new space on the device. -@item amend [-f @var{fmt}] [-t @var{cache}] -o @var{options} @var{filename} +@item amend [-p] [-f @var{fmt}] [-t @var{cache}] -o @var{options} @var{filename} Amends the image format specific @var{options} for the image file @var{filename}. Not all file formats support this operation. From b2f27e4438bb9eb302a0976e5b988bdfd55e65dc Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:52 +0100 Subject: [PATCH 38/53] qemu-img: Fix insignificant memleak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As soon as options is set in img_amend(), it needs to be freed before the function returns. This leak is rather insignificant, as qemu-img will exit subsequently anyway, but there's no point in not fixing it. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Reviewed-by: Kevin Wolf Reviewed-by: Benoît Canet Message-id: 1414404776-4919-4-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-img.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index c7b394a7f9..66a7eb4045 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2948,7 +2948,9 @@ static int img_amend(int argc, char **argv) } if (optind != argc - 1) { - error_exit("Expecting one image file name"); + error_report("Expecting one image file name"); + ret = -1; + goto out; } flags = BDRV_O_FLAGS | BDRV_O_RDWR; From 4057a2b24a5f5dae826d438217f77332c3a6842e Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:53 +0100 Subject: [PATCH 39/53] block/qcow2: Implement status CB for amend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only really time-consuming operation potentially performed by qcow2_amend_options() is zero cluster expansion when downgrading qcow2 images from compat=1.1 to compat=0.10, so report status of that operation and that operation only through the status CB. For this, approximate the progress as the number of L1 entries visited during the operation. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Reviewed-by: Benoit Canet Message-id: 1414404776-4919-5-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2-cluster.c | 37 +++++++++++++++++++++++++++++++++---- block/qcow2.c | 7 ++++--- block/qcow2.h | 3 ++- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 8411f5e6aa..5687a4b15b 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1618,10 +1618,17 @@ fail: * zero expansion (i.e., has been filled with zeroes and is referenced from an * L2 table). nb_clusters contains the total cluster count of the image file, * i.e., the number of bits in expanded_clusters. + * + * l1_entries and *visited_l1_entries are used to keep track of progress for + * status_cb(). l1_entries contains the total number of L1 entries and + * *visited_l1_entries counts all visited L1 entries. */ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, int l1_size, uint8_t **expanded_clusters, - uint64_t *nb_clusters) + uint64_t *nb_clusters, + int64_t *visited_l1_entries, + int64_t l1_entries, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; bool is_active_l1 = (l1_table == s->l1_table); @@ -1644,6 +1651,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (!l2_offset) { /* unallocated */ + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } continue; } @@ -1776,6 +1787,11 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, } } } + + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } } ret = 0; @@ -1802,15 +1818,24 @@ fail: * allocation for pre-allocated ones). This is important for downgrading to a * qcow2 version which doesn't yet support metadata zero clusters. */ -int qcow2_expand_zero_clusters(BlockDriverState *bs) +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; uint64_t *l1_table = NULL; uint64_t nb_clusters; + int64_t l1_entries = 0, visited_l1_entries = 0; uint8_t *expanded_clusters; int ret; int i, j; + if (status_cb) { + l1_entries = s->l1_size; + for (i = 0; i < s->nb_snapshots; i++) { + l1_entries += s->snapshots[i].l1_size; + } + } + nb_clusters = size_to_clusters(s, bs->file->total_sectors * BDRV_SECTOR_SIZE); expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8); @@ -1820,7 +1845,9 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) } ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, - &expanded_clusters, &nb_clusters); + &expanded_clusters, &nb_clusters, + &visited_l1_entries, l1_entries, + status_cb); if (ret < 0) { goto fail; } @@ -1854,7 +1881,9 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) } ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, - &expanded_clusters, &nb_clusters); + &expanded_clusters, &nb_clusters, + &visited_l1_entries, l1_entries, + status_cb); if (ret < 0) { goto fail; } diff --git a/block/qcow2.c b/block/qcow2.c index 02427936c9..d12049451a 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2550,7 +2550,8 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, * Downgrades an image's version. To achieve this, any incompatible features * have to be removed. */ -static int qcow2_downgrade(BlockDriverState *bs, int target_version) +static int qcow2_downgrade(BlockDriverState *bs, int target_version, + BlockDriverAmendStatusCB *status_cb) { BDRVQcowState *s = bs->opaque; int current_version = s->qcow_version; @@ -2599,7 +2600,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version) /* clearing autoclear features is trivial */ s->autoclear_features = 0; - ret = qcow2_expand_zero_clusters(bs); + ret = qcow2_expand_zero_clusters(bs, status_cb); if (ret < 0) { return ret; } @@ -2692,7 +2693,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, return ret; } } else { - ret = qcow2_downgrade(bs, new_version); + ret = qcow2_downgrade(bs, new_version, status_cb); if (ret < 0) { return ret; } diff --git a/block/qcow2.h b/block/qcow2.h index 886b25b7e0..21ac8cead3 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -537,7 +537,8 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors, enum qcow2_discard_type type, bool full_discard); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); -int qcow2_expand_zero_clusters(BlockDriverState *bs); +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb); /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); From 44751917db7466739af358fdb60ae5fb84ed89df Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:54 +0100 Subject: [PATCH 40/53] block/qcow2: Make get_refcount() global MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading the refcount of a cluster is an operation which can be useful in all of the qcow2 code, so make that function globally available. While touching this function, amend the comment describing the "addend" parameter: It is (no longer, if it ever was) necessary to have it set to -1 or 1; any value is fine. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Reviewed-by: Benoit Canet Message-id: 1414404776-4919-6-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2-refcount.c | 26 +++++++++++++------------- block/qcow2.h | 2 ++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1477031206..9afdb40b40 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -91,7 +91,7 @@ static int load_refcount_block(BlockDriverState *bs, * return value is the refcount of the cluster, negative values are -errno * and indicate an error. */ -static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; uint64_t refcount_table_index, block_index; @@ -629,8 +629,7 @@ fail: } /* - * Increases or decreases the refcount of a given cluster by one. - * addend must be 1 or -1. + * Increases or decreases the refcount of a given cluster. * * If the return value is non-negative, it is the new refcount of the cluster. * If it is negative, it is -errno and indicates an error. @@ -649,7 +648,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, return ret; } - return get_refcount(bs, cluster_index); + return qcow2_get_refcount(bs, cluster_index); } @@ -670,7 +669,7 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) retry: for(i = 0; i < nb_clusters; i++) { uint64_t next_cluster_index = s->free_cluster_index++; - refcount = get_refcount(bs, next_cluster_index); + refcount = qcow2_get_refcount(bs, next_cluster_index); if (refcount < 0) { return refcount; @@ -734,7 +733,7 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, /* Check how many clusters there are free */ cluster_index = offset >> s->cluster_bits; for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); + refcount = qcow2_get_refcount(bs, cluster_index++); if (refcount < 0) { return refcount; @@ -981,7 +980,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, cluster_index, addend, QCOW2_DISCARD_SNAPSHOT); } else { - refcount = get_refcount(bs, cluster_index); + refcount = qcow2_get_refcount(bs, cluster_index); } if (refcount < 0) { @@ -1021,7 +1020,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, refcount = qcow2_update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); } else { - refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); } if (refcount < 0) { ret = refcount; @@ -1332,8 +1331,8 @@ fail: * Checks the OFLAG_COPIED flag for all L1 and L2 entries. * * This function does not print an error message nor does it increment - * check_errors if get_refcount fails (this is because such an error will have - * been already detected and sufficiently signaled by the calling function + * check_errors if qcow2_get_refcount fails (this is because such an error will + * have been already detected and sufficiently signaled by the calling function * (qcow2_check_refcounts) by the time this function is called). */ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, @@ -1354,7 +1353,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, continue; } - refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); if (refcount < 0) { /* don't print message nor increment check_errors */ continue; @@ -1396,7 +1395,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, if ((cluster_type == QCOW2_CLUSTER_NORMAL) || ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { - refcount = get_refcount(bs, data_offset >> s->cluster_bits); + refcount = qcow2_get_refcount(bs, + data_offset >> s->cluster_bits); if (refcount < 0) { /* don't print message nor increment check_errors */ continue; @@ -1632,7 +1632,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, int refcount1, refcount2, ret; for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { - refcount1 = get_refcount(bs, i); + refcount1 = qcow2_get_refcount(bs, i); if (refcount1 < 0) { fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", i, strerror(-refcount1)); diff --git a/block/qcow2.h b/block/qcow2.h index 21ac8cead3..6e39a1b639 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -487,6 +487,8 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index); + int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, int addend, enum qcow2_discard_type type); From ecf58777c5ff242f656dd3836475a5ded9c2eaa5 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:55 +0100 Subject: [PATCH 41/53] block/qcow2: Simplify shared L2 handling in amend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we have a bitmap for keeping track of which clusters have been created during the zero cluster expansion process. This was necessary because we need to properly increase the refcount for shared L2 tables. However, now we can simply take the L2 refcount and use it for the cluster allocated for expansion. This will be the correct refcount and therefore we don't have to remember that cluster having been allocated any more. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Reviewed-by: Benoit Canet Message-id: 1414404776-4919-7-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- block/qcow2-cluster.c | 94 +++++++++++++------------------------------ 1 file changed, 28 insertions(+), 66 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 5687a4b15b..df0b2c9cec 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1613,20 +1613,12 @@ fail: * Expands all zero clusters in a specific L1 table (or deallocates them, for * non-backed non-pre-allocated zero clusters). * - * expanded_clusters is a bitmap where every bit corresponds to one cluster in - * the image file; a bit gets set if the corresponding cluster has been used for - * zero expansion (i.e., has been filled with zeroes and is referenced from an - * L2 table). nb_clusters contains the total cluster count of the image file, - * i.e., the number of bits in expanded_clusters. - * * l1_entries and *visited_l1_entries are used to keep track of progress for * status_cb(). l1_entries contains the total number of L1 entries and * *visited_l1_entries counts all visited L1 entries. */ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, - int l1_size, uint8_t **expanded_clusters, - uint64_t *nb_clusters, - int64_t *visited_l1_entries, + int l1_size, int64_t *visited_l1_entries, int64_t l1_entries, BlockDriverAmendStatusCB *status_cb) { @@ -1648,6 +1640,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, for (i = 0; i < l1_size; i++) { uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; bool l2_dirty = false; + int l2_refcount; if (!l2_offset) { /* unallocated */ @@ -1671,33 +1664,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } + l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits); + if (l2_refcount < 0) { + ret = l2_refcount; + goto fail; + } + for (j = 0; j < s->l2_size; j++) { uint64_t l2_entry = be64_to_cpu(l2_table[j]); - int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index; + int64_t offset = l2_entry & L2E_OFFSET_MASK; int cluster_type = qcow2_get_cluster_type(l2_entry); bool preallocated = offset != 0; - if (cluster_type == QCOW2_CLUSTER_NORMAL) { - cluster_index = offset >> s->cluster_bits; - assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); - if ((*expanded_clusters)[cluster_index / 8] & - (1 << (cluster_index % 8))) { - /* Probably a shared L2 table; this cluster was a zero - * cluster which has been expanded, its refcount - * therefore most likely requires an update. */ - ret = qcow2_update_cluster_refcount(bs, cluster_index, 1, - QCOW2_DISCARD_NEVER); - if (ret < 0) { - goto fail; - } - /* Since we just increased the refcount, the COPIED flag may - * no longer be set. */ - l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED); - l2_dirty = true; - } - continue; - } - else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) { + if (cluster_type != QCOW2_CLUSTER_ZERO) { continue; } @@ -1715,6 +1694,19 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, ret = offset; goto fail; } + + if (l2_refcount > 1) { + /* For shared L2 tables, set the refcount accordingly (it is + * already 1 and needs to be l2_refcount) */ + ret = qcow2_update_cluster_refcount(bs, + offset >> s->cluster_bits, l2_refcount - 1, + QCOW2_DISCARD_OTHER); + if (ret < 0) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_OTHER); + goto fail; + } + } } ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); @@ -1736,29 +1728,12 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); - l2_dirty = true; - - cluster_index = offset >> s->cluster_bits; - - if (cluster_index >= *nb_clusters) { - uint64_t old_bitmap_size = (*nb_clusters + 7) / 8; - uint64_t new_bitmap_size; - /* The offset may lie beyond the old end of the underlying image - * file for growable files only */ - assert(bs->file->growable); - *nb_clusters = size_to_clusters(s, bs->file->total_sectors * - BDRV_SECTOR_SIZE); - new_bitmap_size = (*nb_clusters + 7) / 8; - *expanded_clusters = g_realloc(*expanded_clusters, - new_bitmap_size); - /* clear the newly allocated space */ - memset(&(*expanded_clusters)[old_bitmap_size], 0, - new_bitmap_size - old_bitmap_size); + if (l2_refcount == 1) { + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + } else { + l2_table[j] = cpu_to_be64(offset); } - - assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); - (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8); + l2_dirty = true; } if (is_active_l1) { @@ -1823,9 +1798,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; uint64_t *l1_table = NULL; - uint64_t nb_clusters; int64_t l1_entries = 0, visited_l1_entries = 0; - uint8_t *expanded_clusters; int ret; int i, j; @@ -1836,16 +1809,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, } } - nb_clusters = size_to_clusters(s, bs->file->total_sectors * - BDRV_SECTOR_SIZE); - expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8); - if (expanded_clusters == NULL) { - ret = -ENOMEM; - goto fail; - } - ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, - &expanded_clusters, &nb_clusters, &visited_l1_entries, l1_entries, status_cb); if (ret < 0) { @@ -1881,7 +1845,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, } ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, - &expanded_clusters, &nb_clusters, &visited_l1_entries, l1_entries, status_cb); if (ret < 0) { @@ -1892,7 +1855,6 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, ret = 0; fail: - g_free(expanded_clusters); g_free(l1_table); return ret; } From 78fa65821dd581f375bf8bce47d6e00e64a8066a Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Oct 2014 11:12:56 +0100 Subject: [PATCH 42/53] iotests: Expand test 061 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add some tests for progress output to 061. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Benoît Canet Reviewed-by: Kevin Wolf Reviewed-by: Benoit Canet Message-id: 1414404776-4919-8-git-send-email-mreitz@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/061 | 25 +++++++++++++++++++++++++ tests/qemu-iotests/061.out | 30 ++++++++++++++++++++++++++++++ tests/qemu-iotests/group | 2 +- 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index ab98def6d4..8d37f8a65c 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -209,6 +209,31 @@ $QEMU_IMG amend -o "compat=0.10" "$TEST_IMG" _check_test_img $QEMU_IO -c "read -P 0 0 64M" "$TEST_IMG" | _filter_qemu_io +echo +echo "=== Testing progress report without snapshot ===" +echo +IMGOPTS="compat=1.1" TEST_IMG="$TEST_IMG.base" _make_test_img 4G +IMGOPTS="compat=1.1" _make_test_img -b "$TEST_IMG.base" 4G +$QEMU_IO -c "write -z 0 64k" \ + -c "write -z 1G 64k" \ + -c "write -z 2G 64k" \ + -c "write -z 3G 64k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IMG amend -p -o "compat=0.10" "$TEST_IMG" +_check_test_img + +echo +echo "=== Testing progress report with snapshot ===" +echo +IMGOPTS="compat=1.1" TEST_IMG="$TEST_IMG.base" _make_test_img 4G +IMGOPTS="compat=1.1" _make_test_img -b "$TEST_IMG.base" 4G +$QEMU_IO -c "write -z 0 64k" \ + -c "write -z 1G 64k" \ + -c "write -z 2G 64k" \ + -c "write -z 3G 64k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IMG snapshot -c foo "$TEST_IMG" +$QEMU_IMG amend -p -o "compat=0.10" "$TEST_IMG" +_check_test_img + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 4ae6472c73..9045544df2 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -390,4 +390,34 @@ wrote 67108864/67108864 bytes at offset 0 No errors were found on the image. read 67108864/67108864 bytes at offset 0 64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Testing progress report without snapshot === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=4294967296 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 1073741824 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 2147483648 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 3221225472 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + (0.00/100%) (12.50/100%) (25.00/100%) (37.50/100%) (50.00/100%) (62.50/100%) (75.00/100%) (87.50/100%) (100.00/100%) (100.00/100%) +No errors were found on the image. + +=== Testing progress report with snapshot === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=4294967296 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 1073741824 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 2147483648 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 3221225472 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + (0.00/100%) (6.25/100%) (12.50/100%) (18.75/100%) (25.00/100%) (31.25/100%) (37.50/100%) (43.75/100%) (50.00/100%) (56.25/100%) (62.50/100%) (68.75/100%) (75.00/100%) (81.25/100%) (87.50/100%) (93.75/100%) (100.00/100%) (100.00/100%) +No errors were found on the image. *** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 281c889921..7b2c66676b 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -67,7 +67,7 @@ 058 rw auto quick 059 rw auto quick 060 rw auto quick -061 rw auto quick +061 rw auto 062 rw auto quick 063 rw auto quick 064 rw auto quick From 3d948cdf3760b52238038626a7ffa7d30913060b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:50 +0100 Subject: [PATCH 43/53] block: acquire AioContext in generic blockjob QMP commands block-job-set-speed, block-job-cancel, block-job-pause, block-job-resume, and block-job-complete must acquire the BlockDriverState AioContext so that it is safe to access bs. At the moment bs->job is always NULL when dataplane is active because op blockers prevent blockjobs from starting. Once the rest of the blockjob API has been made aware of AioContext we can drop the op blocker. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-2-git-send-email-stefanha@redhat.com --- blockdev.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/blockdev.c b/blockdev.c index eb743a956c..501473da69 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2375,20 +2375,35 @@ void qmp_drive_mirror(const char *device, const char *target, } } -static BlockJob *find_block_job(const char *device) +/* Get the block job for a given device name and acquire its AioContext */ +static BlockJob *find_block_job(const char *device, AioContext **aio_context) { BlockDriverState *bs; bs = bdrv_find(device); - if (!bs || !bs->job) { - return NULL; + if (!bs) { + goto notfound; } + + *aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(*aio_context); + + if (!bs->job) { + aio_context_release(*aio_context); + goto notfound; + } + return bs->job; + +notfound: + *aio_context = NULL; + return NULL; } void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp) { - BlockJob *job = find_block_job(device); + AioContext *aio_context; + BlockJob *job = find_block_job(device, &aio_context); if (!job) { error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); @@ -2396,34 +2411,40 @@ void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp) } block_job_set_speed(job, speed, errp); + aio_context_release(aio_context); } void qmp_block_job_cancel(const char *device, bool has_force, bool force, Error **errp) { - BlockJob *job = find_block_job(device); - - if (!has_force) { - force = false; - } + AioContext *aio_context; + BlockJob *job = find_block_job(device, &aio_context); if (!job) { error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); return; } + + if (!has_force) { + force = false; + } + if (job->paused && !force) { error_setg(errp, "The block job for device '%s' is currently paused", device); - return; + goto out; } trace_qmp_block_job_cancel(job); block_job_cancel(job); +out: + aio_context_release(aio_context); } void qmp_block_job_pause(const char *device, Error **errp) { - BlockJob *job = find_block_job(device); + AioContext *aio_context; + BlockJob *job = find_block_job(device, &aio_context); if (!job) { error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); @@ -2432,11 +2453,13 @@ void qmp_block_job_pause(const char *device, Error **errp) trace_qmp_block_job_pause(job); block_job_pause(job); + aio_context_release(aio_context); } void qmp_block_job_resume(const char *device, Error **errp) { - BlockJob *job = find_block_job(device); + AioContext *aio_context; + BlockJob *job = find_block_job(device, &aio_context); if (!job) { error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); @@ -2445,11 +2468,13 @@ void qmp_block_job_resume(const char *device, Error **errp) trace_qmp_block_job_resume(job); block_job_resume(job); + aio_context_release(aio_context); } void qmp_block_job_complete(const char *device, Error **errp) { - BlockJob *job = find_block_job(device); + AioContext *aio_context; + BlockJob *job = find_block_job(device, &aio_context); if (!job) { error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device); @@ -2458,6 +2483,7 @@ void qmp_block_job_complete(const char *device, Error **errp) trace_qmp_block_job_complete(job); block_job_complete(job, errp); + aio_context_release(aio_context); } void qmp_change_backing_file(const char *device, From 69691e72708603592b1618f1a68d2a3f07db853d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:51 +0100 Subject: [PATCH 44/53] blockdev: acquire AioContext in do_qmp_query_block_jobs_one() Make sure that query-block-jobs acquires the BlockDriverState AioContext so that the blockjob isn't running in another thread while we access its state. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-3-git-send-email-stefanha@redhat.com --- blockdev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/blockdev.c b/blockdev.c index 501473da69..40fc5d624f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2628,12 +2628,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp) BlockDriverState *bs; for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1); elem->value = block_job_query(bs->job); *p_next = elem; p_next = &elem->next; } + + aio_context_release(aio_context); } return head; From 91fddb0db6fee207ccdcca22dd996cf0154a1004 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:52 +0100 Subject: [PATCH 45/53] blockdev: acquire AioContext in blockdev_mark_auto_del() When an emulated storage controller is unrealized it will call blockdev_mark_auto_del(). This will cancel any running block job (and that eventually releases its reference to the BDS so it can be freed). Since the block job may be executing in another AioContext we must acquire/release to ensure thread safety. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-4-git-send-email-stefanha@redhat.com --- blockdev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/blockdev.c b/blockdev.c index 40fc5d624f..741df9805f 100644 --- a/blockdev.c +++ b/blockdev.c @@ -115,14 +115,21 @@ void blockdev_mark_auto_del(BlockBackend *blk) { DriveInfo *dinfo = blk_legacy_dinfo(blk); BlockDriverState *bs = blk_bs(blk); + AioContext *aio_context; if (!dinfo) { return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + if (bs->job) { block_job_cancel(bs->job); } + + aio_context_release(aio_context); + dinfo->auto_del = 1; } From 723c5d93c51bdb3adbc238ce90195c0864aa6cd5 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:53 +0100 Subject: [PATCH 46/53] blockdev: add note that block_job_cb() must be thread-safe This function is correct but we should document the constraint that everything must be thread-safe. Emitting QMP events and scheduling BHs are both thread-safe so nothing needs to be done here. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-5-git-send-email-stefanha@redhat.com --- blockdev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/blockdev.c b/blockdev.c index 741df9805f..774051b44a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1929,6 +1929,11 @@ out: static void block_job_cb(void *opaque, int ret) { + /* Note that this function may be executed from another AioContext besides + * the QEMU main loop. If you need to access anything that assumes the + * QEMU global mutex, use a BH or introduce a mutex. + */ + BlockDriverState *bs = opaque; const char *msg = NULL; From dec7d421f85578b0949292336f784f55ac84812d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:54 +0100 Subject: [PATCH 47/53] blockjob: add block_job_defer_to_main_loop() Block jobs will run in the BlockDriverState's AioContext, which may not always be the QEMU main loop. There are some block layer APIs that are either not thread-safe or risk lock ordering problems. This includes bdrv_unref(), bdrv_close(), and anything that calls bdrv_drain_all(). The block_job_defer_to_main_loop() API allows a block job to schedule a function to run in the main loop with the BlockDriverState AioContext held. This function will be used to perform cleanup and backing chain manipulations in block jobs. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-6-git-send-email-stefanha@redhat.com --- blockjob.c | 45 ++++++++++++++++++++++++++++++++++++++++ include/block/blockjob.h | 19 +++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/blockjob.c b/blockjob.c index 448b9ce113..cd4b573adb 100644 --- a/blockjob.c +++ b/blockjob.c @@ -342,3 +342,48 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, } return action; } + +typedef struct { + BlockJob *job; + QEMUBH *bh; + AioContext *aio_context; + BlockJobDeferToMainLoopFn *fn; + void *opaque; +} BlockJobDeferToMainLoopData; + +static void block_job_defer_to_main_loop_bh(void *opaque) +{ + BlockJobDeferToMainLoopData *data = opaque; + AioContext *aio_context; + + qemu_bh_delete(data->bh); + + /* Prevent race with block_job_defer_to_main_loop() */ + aio_context_acquire(data->aio_context); + + /* Fetch BDS AioContext again, in case it has changed */ + aio_context = bdrv_get_aio_context(data->job->bs); + aio_context_acquire(aio_context); + + data->fn(data->job, data->opaque); + + aio_context_release(aio_context); + + aio_context_release(data->aio_context); + + g_free(data); +} + +void block_job_defer_to_main_loop(BlockJob *job, + BlockJobDeferToMainLoopFn *fn, + void *opaque) +{ + BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data)); + data->job = job; + data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data); + data->aio_context = bdrv_get_aio_context(job->bs); + data->fn = fn; + data->opaque = opaque; + + qemu_bh_schedule(data->bh); +} diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 9694f130b0..b6d4ebbe03 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -315,4 +315,23 @@ void block_job_iostatus_reset(BlockJob *job); BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, BlockdevOnError on_err, int is_read, int error); + +typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque); + +/** + * block_job_defer_to_main_loop: + * @job: The job + * @fn: The function to run in the main loop + * @opaque: The opaque value that is passed to @fn + * + * Execute a given function in the main loop with the BlockDriverState + * AioContext acquired. Block jobs must call bdrv_unref(), bdrv_close(), and + * anything that uses bdrv_drain_all() in the main loop. + * + * The @job AioContext is held while @fn executes. + */ +void block_job_defer_to_main_loop(BlockJob *job, + BlockJobDeferToMainLoopFn *fn, + void *opaque); + #endif From 5b98db0ad3ad2919c71572085d104765bad6c658 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:55 +0100 Subject: [PATCH 48/53] block: add bdrv_drain() Now that op blockers are in use, we can ensure that no other sources are generating I/O on a BlockDriverState. Therefore it is possible to drain requests for a single BDS. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-7-git-send-email-stefanha@redhat.com --- block.c | 36 +++++++++++++++++++++++++++++------- include/block/block.h | 1 + 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index c5ff56019b..a909b9d749 100644 --- a/block.c +++ b/block.c @@ -1904,6 +1904,34 @@ static bool bdrv_requests_pending(BlockDriverState *bs) return false; } +static bool bdrv_drain_one(BlockDriverState *bs) +{ + bool bs_busy; + + bdrv_flush_io_queue(bs); + bdrv_start_throttled_reqs(bs); + bs_busy = bdrv_requests_pending(bs); + bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); + return bs_busy; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * See the warning in bdrv_drain_all(). This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + /* * Wait for pending requests to complete across all BlockDriverStates * @@ -1927,16 +1955,10 @@ void bdrv_drain_all(void) QTAILQ_FOREACH(bs, &bdrv_states, device_list) { AioContext *aio_context = bdrv_get_aio_context(bs); - bool bs_busy; aio_context_acquire(aio_context); - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(aio_context, bs_busy); + busy |= bdrv_drain_one(bs); aio_context_release(aio_context); - - busy |= bs_busy; } } } diff --git a/include/block/block.h b/include/block/block.h index 5d13282fa9..13e453736c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -334,6 +334,7 @@ int bdrv_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_flush(BlockDriverState *bs); int bdrv_flush_all(void); void bdrv_close_all(void); +void bdrv_drain(BlockDriverState *bs); void bdrv_drain_all(void); int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); From 761731b1805f6ef64eb615e5b82a0801db3cde78 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:56 +0100 Subject: [PATCH 49/53] block: let backup blockjob run in BDS AioContext The backup block job must run in the BlockDriverState AioContext so that it works with dataplane. The basics of acquiring the AioContext are easy in blockdev.c. The completion code in block/backup.c must call bdrv_unref() from the main loop. Use block_job_defer_to_main_loop() to achieve that. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-8-git-send-email-stefanha@redhat.com --- block/backup.c | 21 +++++++++++++++++++-- blockdev.c | 23 ++++++++++++++++------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/block/backup.c b/block/backup.c index e334740161..792e65514b 100644 --- a/block/backup.c +++ b/block/backup.c @@ -227,9 +227,25 @@ static BlockErrorAction backup_error_action(BackupBlockJob *job, } } +typedef struct { + int ret; +} BackupCompleteData; + +static void backup_complete(BlockJob *job, void *opaque) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + BackupCompleteData *data = opaque; + + bdrv_unref(s->target); + + block_job_completed(job, data->ret); + g_free(data); +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; + BackupCompleteData *data; BlockDriverState *bs = job->common.bs; BlockDriverState *target = job->target; BlockdevOnError on_target_error = job->on_target_error; @@ -344,9 +360,10 @@ static void coroutine_fn backup_run(void *opaque) hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); - bdrv_unref(target); - block_job_completed(&job->common, ret); + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&job->common, backup_complete, data); } void backup_start(BlockDriverState *bs, BlockDriverState *target, diff --git a/blockdev.c b/blockdev.c index 774051b44a..9c6898833b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2108,6 +2108,7 @@ void qmp_drive_backup(const char *device, const char *target, BlockDriverState *bs; BlockDriverState *target_bs; BlockDriverState *source = NULL; + AioContext *aio_context; BlockDriver *drv = NULL; Error *local_err = NULL; int flags; @@ -2133,9 +2134,12 @@ void qmp_drive_backup(const char *device, const char *target, return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + if (!bdrv_is_inserted(bs)) { error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device); - return; + goto out; } if (!has_format) { @@ -2145,12 +2149,12 @@ void qmp_drive_backup(const char *device, const char *target, drv = bdrv_find_format(format); if (!drv) { error_set(errp, QERR_INVALID_BLOCK_FORMAT, format); - return; + goto out; } } if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { - return; + goto out; } flags = bs->open_flags | BDRV_O_RDWR; @@ -2170,7 +2174,7 @@ void qmp_drive_backup(const char *device, const char *target, size = bdrv_getlength(bs); if (size < 0) { error_setg_errno(errp, -size, "bdrv_getlength failed"); - return; + goto out; } if (mode != NEW_IMAGE_MODE_EXISTING) { @@ -2187,23 +2191,28 @@ void qmp_drive_backup(const char *device, const char *target, if (local_err) { error_propagate(errp, local_err); - return; + goto out; } target_bs = NULL; ret = bdrv_open(&target_bs, target, NULL, NULL, flags, drv, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return; + goto out; } + bdrv_set_aio_context(target_bs, aio_context); + backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); - return; + goto out; } + +out: + aio_context_release(aio_context); } BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) From f3e69beb942103ccd5248273e4d95e76b64ab64c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:57 +0100 Subject: [PATCH 50/53] block: let stream blockjob run in BDS AioContext The stream block job must run in the BlockDriverState AioContext so that it works with dataplane. The basics of acquiring the AioContext are easy in blockdev.c. The tricky part is the completion code which drops part of the backing file chain. This must be done in the main loop where bdrv_unref() and bdrv_close() are safe to call. Use block_job_defer_to_main_loop() to achieve that. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-9-git-send-email-stefanha@redhat.com --- block/stream.c | 50 ++++++++++++++++++++++++++++++++++++-------------- blockdev.c | 16 ++++++++++++---- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/block/stream.c b/block/stream.c index a1dc8da484..a628901f69 100644 --- a/block/stream.c +++ b/block/stream.c @@ -79,9 +79,39 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, bdrv_refresh_limits(top, NULL); } +typedef struct { + int ret; + bool reached_end; +} StreamCompleteData; + +static void stream_complete(BlockJob *job, void *opaque) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + StreamCompleteData *data = opaque; + BlockDriverState *base = s->base; + + if (!block_job_is_cancelled(&s->common) && data->reached_end && + data->ret == 0) { + const char *base_id = NULL, *base_fmt = NULL; + if (base) { + base_id = s->backing_file_str; + if (base->drv) { + base_fmt = base->drv->format_name; + } + } + data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); + close_unused_images(job->bs, base, base_id); + } + + g_free(s->backing_file_str); + block_job_completed(&s->common, data->ret); + g_free(data); +} + static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; + StreamCompleteData *data; BlockDriverState *bs = s->common.bs; BlockDriverState *base = s->base; int64_t sector_num, end; @@ -183,21 +213,13 @@ wait: /* Do not remove the backing file if an error was there but ignored. */ ret = error; - if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) { - const char *base_id = NULL, *base_fmt = NULL; - if (base) { - base_id = s->backing_file_str; - if (base->drv) { - base_fmt = base->drv->format_name; - } - } - ret = bdrv_change_backing_file(bs, base_id, base_fmt); - close_unused_images(bs, base, base_id); - } - qemu_vfree(buf); - g_free(s->backing_file_str); - block_job_completed(&s->common, ret); + + /* Modify backing chain and close BDSes in main loop */ + data = g_malloc(sizeof(*data)); + data->ret = ret; + data->reached_end = sector_num == end; + block_job_defer_to_main_loop(&s->common, stream_complete, data); } static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) diff --git a/blockdev.c b/blockdev.c index 9c6898833b..6e43d2e3a8 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1963,6 +1963,7 @@ void qmp_block_stream(const char *device, { BlockDriverState *bs; BlockDriverState *base_bs = NULL; + AioContext *aio_context; Error *local_err = NULL; const char *base_name = NULL; @@ -1976,16 +1977,20 @@ void qmp_block_stream(const char *device, return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) { - return; + goto out; } if (has_base) { base_bs = bdrv_find_backing_image(bs, base); if (base_bs == NULL) { error_set(errp, QERR_BASE_NOT_FOUND, base); - return; + goto out; } + assert(bdrv_get_aio_context(base_bs) == aio_context); base_name = base; } @@ -1994,7 +1999,7 @@ void qmp_block_stream(const char *device, if (base_bs == NULL && has_backing_file) { error_setg(errp, "backing file specified, but streaming the " "entire chain"); - return; + goto out; } /* backing_file string overrides base bs filename */ @@ -2004,10 +2009,13 @@ void qmp_block_stream(const char *device, on_error, block_job_cb, bs, &local_err); if (local_err) { error_propagate(errp, local_err); - return; + goto out; } trace_qmp_block_stream(bs, bs->job); + +out: + aio_context_release(aio_context); } void qmp_block_commit(const char *device, From 5a7e7a0bad17c96e03f55ed7019e2d7545e21a96 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:58 +0100 Subject: [PATCH 51/53] block: let mirror blockjob run in BDS AioContext The mirror block job must run in the BlockDriverState AioContext so that it works with dataplane. Acquire the AioContext in blockdev.c so starting the block job is safe. Note that to_replace is treated separately from other BlockDriverStates in that it does not need to be in the same AioContext. Explicitly acquire/release to_replace's AioContext when accessing it. The completion code in block/mirror.c must perform BDS graph manipulation and bdrv_reopen() from the main loop. Use block_job_defer_to_main_loop() to achieve that. The bdrv_drain_all() call is not allowed outside the main loop since it could lead to lock ordering problems. Use bdrv_drain(bs) instead because we have acquired the AioContext so nothing else can sneak in I/O. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-10-git-send-email-stefanha@redhat.com --- block.c | 13 ++++++-- block/mirror.c | 85 +++++++++++++++++++++++++++++++++++--------------- blockdev.c | 38 +++++++++++++++------- 3 files changed, 97 insertions(+), 39 deletions(-) diff --git a/block.c b/block.c index a909b9d749..dacd8815d3 100644 --- a/block.c +++ b/block.c @@ -5850,13 +5850,19 @@ bool bdrv_is_first_non_filter(BlockDriverState *candidate) BlockDriverState *check_to_replace_node(const char *node_name, Error **errp) { BlockDriverState *to_replace_bs = bdrv_find_node(node_name); + AioContext *aio_context; + if (!to_replace_bs) { error_setg(errp, "Node name '%s' not found", node_name); return NULL; } + aio_context = bdrv_get_aio_context(to_replace_bs); + aio_context_acquire(aio_context); + if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) { - return NULL; + to_replace_bs = NULL; + goto out; } /* We don't want arbitrary node of the BDS chain to be replaced only the top @@ -5866,9 +5872,12 @@ BlockDriverState *check_to_replace_node(const char *node_name, Error **errp) */ if (!bdrv_is_first_non_filter(to_replace_bs)) { error_setg(errp, "Only top most non filter can be replaced"); - return NULL; + to_replace_bs = NULL; + goto out; } +out: + aio_context_release(aio_context); return to_replace_bs; } diff --git a/block/mirror.c b/block/mirror.c index 2a1acfe26f..2c6dd2a4c1 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -321,9 +321,56 @@ static void mirror_drain(MirrorBlockJob *s) } } +typedef struct { + int ret; +} MirrorExitData; + +static void mirror_exit(BlockJob *job, void *opaque) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + MirrorExitData *data = opaque; + AioContext *replace_aio_context = NULL; + + if (s->to_replace) { + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + } + + if (s->should_complete && data->ret == 0) { + BlockDriverState *to_replace = s->common.bs; + if (s->to_replace) { + to_replace = s->to_replace; + } + if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { + bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); + } + bdrv_swap(s->target, to_replace); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + bdrv_set_backing_hd(s->base, NULL); + bdrv_unref(p); + } + } + if (s->to_replace) { + bdrv_op_unblock_all(s->to_replace, s->replace_blocker); + error_free(s->replace_blocker); + bdrv_unref(s->to_replace); + } + if (replace_aio_context) { + aio_context_release(replace_aio_context); + } + g_free(s->replaces); + bdrv_unref(s->target); + block_job_completed(&s->common, data->ret); + g_free(data); +} + static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; + MirrorExitData *data; BlockDriverState *bs = s->common.bs; int64_t sector_num, end, sectors_per_chunk, length; uint64_t last_pause_ns; @@ -479,7 +526,7 @@ static void coroutine_fn mirror_run(void *opaque) * mirror_populate runs. */ trace_mirror_before_drain(s, cnt); - bdrv_drain_all(); + bdrv_drain(bs); cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } @@ -520,31 +567,10 @@ immediate_exit: g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); - if (s->should_complete && ret == 0) { - BlockDriverState *to_replace = s->common.bs; - if (s->to_replace) { - to_replace = s->to_replace; - } - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_swap(s->target, to_replace); - if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { - /* drop the bs loop chain formed by the swap: break the loop then - * trigger the unref from the top one */ - BlockDriverState *p = s->base->backing_hd; - bdrv_set_backing_hd(s->base, NULL); - bdrv_unref(p); - } - } - if (s->to_replace) { - bdrv_op_unblock_all(s->to_replace, s->replace_blocker); - error_free(s->replace_blocker); - bdrv_unref(s->to_replace); - } - g_free(s->replaces); - bdrv_unref(s->target); - block_job_completed(&s->common, ret); + + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&s->common, mirror_exit, data); } static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -584,16 +610,23 @@ static void mirror_complete(BlockJob *job, Error **errp) /* check the target bs is not blocked and block all operations on it */ if (s->replaces) { + AioContext *replace_aio_context; + s->to_replace = check_to_replace_node(s->replaces, &local_err); if (!s->to_replace) { error_propagate(errp, local_err); return; } + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + error_setg(&s->replace_blocker, "block device is in use by block-job-complete"); bdrv_op_block_all(s->to_replace, s->replace_blocker); bdrv_ref(s->to_replace); + + aio_context_release(replace_aio_context); } s->should_complete = true; diff --git a/blockdev.c b/blockdev.c index 6e43d2e3a8..13763225ee 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2245,6 +2245,7 @@ void qmp_drive_mirror(const char *device, const char *target, { BlockDriverState *bs; BlockDriverState *source, *target_bs; + AioContext *aio_context; BlockDriver *drv = NULL; Error *local_err = NULL; QDict *options = NULL; @@ -2287,9 +2288,12 @@ void qmp_drive_mirror(const char *device, const char *target, return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + if (!bdrv_is_inserted(bs)) { error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device); - return; + goto out; } if (!has_format) { @@ -2299,12 +2303,12 @@ void qmp_drive_mirror(const char *device, const char *target, drv = bdrv_find_format(format); if (!drv) { error_set(errp, QERR_INVALID_BLOCK_FORMAT, format); - return; + goto out; } } if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR, errp)) { - return; + goto out; } flags = bs->open_flags | BDRV_O_RDWR; @@ -2319,29 +2323,36 @@ void qmp_drive_mirror(const char *device, const char *target, size = bdrv_getlength(bs); if (size < 0) { error_setg_errno(errp, -size, "bdrv_getlength failed"); - return; + goto out; } if (has_replaces) { BlockDriverState *to_replace_bs; + AioContext *replace_aio_context; + int64_t replace_size; if (!has_node_name) { error_setg(errp, "a node-name must be provided when replacing a" " named node of the graph"); - return; + goto out; } to_replace_bs = check_to_replace_node(replaces, &local_err); if (!to_replace_bs) { error_propagate(errp, local_err); - return; + goto out; } - if (size != bdrv_getlength(to_replace_bs)) { + replace_aio_context = bdrv_get_aio_context(to_replace_bs); + aio_context_acquire(replace_aio_context); + replace_size = bdrv_getlength(to_replace_bs); + aio_context_release(replace_aio_context); + + if (size != replace_size) { error_setg(errp, "cannot replace image with a mirror image of " "different size"); - return; + goto out; } } @@ -2370,7 +2381,7 @@ void qmp_drive_mirror(const char *device, const char *target, if (local_err) { error_propagate(errp, local_err); - return; + goto out; } if (has_node_name) { @@ -2386,9 +2397,11 @@ void qmp_drive_mirror(const char *device, const char *target, flags | BDRV_O_NO_BACKING, drv, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return; + goto out; } + bdrv_set_aio_context(target_bs, aio_context); + /* pass the node name to replace to mirror start since it's loose coupling * and will allow to check whether the node still exist at mirror completion */ @@ -2400,8 +2413,11 @@ void qmp_drive_mirror(const char *device, const char *target, if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); - return; + goto out; } + +out: + aio_context_release(aio_context); } /* Get the block job for a given device name and acquire its AioContext */ From 9e85cd5ce0d3fc99d910428c9fd9d267764d341b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:03:59 +0100 Subject: [PATCH 52/53] block: let commit blockjob run in BDS AioContext The commit block job must run in the BlockDriverState AioContext so that it works with dataplane. Acquire the AioContext in blockdev.c so starting the block job is safe. One detail here is that the bdrv_drain_all() must be moved inside the aio_context_acquire() region so requests cannot sneak in between the drain and acquire. The completion code in block/commit.c must perform backing chain manipulation and bdrv_reopen() from the main loop. Use block_job_defer_to_main_loop() to achieve that. Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-11-git-send-email-stefanha@redhat.com --- block/commit.c | 70 +++++++++++++++++++++++++++++++------------------- blockdev.c | 29 ++++++++++++++------- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/block/commit.c b/block/commit.c index 60a2accf04..cfa2bbebc2 100644 --- a/block/commit.c +++ b/block/commit.c @@ -60,17 +60,50 @@ static int coroutine_fn commit_populate(BlockDriverState *bs, return 0; } -static void coroutine_fn commit_run(void *opaque) +typedef struct { + int ret; +} CommitCompleteData; + +static void commit_complete(BlockJob *job, void *opaque) { - CommitBlockJob *s = opaque; + CommitBlockJob *s = container_of(job, CommitBlockJob, common); + CommitCompleteData *data = opaque; BlockDriverState *active = s->active; BlockDriverState *top = s->top; BlockDriverState *base = s->base; BlockDriverState *overlay_bs; + int ret = data->ret; + + if (!block_job_is_cancelled(&s->common) && ret == 0) { + /* success */ + ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); + } + + /* restore base open flags here if appropriate (e.g., change the base back + * to r/o). These reopens do not need to be atomic, since we won't abort + * even on failure here */ + if (s->base_flags != bdrv_get_flags(base)) { + bdrv_reopen(base, s->base_flags, NULL); + } + overlay_bs = bdrv_find_overlay(active, top); + if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { + bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); + } + g_free(s->backing_file_str); + block_job_completed(&s->common, ret); + g_free(data); +} + +static void coroutine_fn commit_run(void *opaque) +{ + CommitBlockJob *s = opaque; + CommitCompleteData *data; + BlockDriverState *top = s->top; + BlockDriverState *base = s->base; int64_t sector_num, end; int ret = 0; int n = 0; - void *buf; + void *buf = NULL; int bytes_written = 0; int64_t base_len; @@ -78,18 +111,18 @@ static void coroutine_fn commit_run(void *opaque) if (s->common.len < 0) { - goto exit_restore_reopen; + goto out; } ret = base_len = bdrv_getlength(base); if (base_len < 0) { - goto exit_restore_reopen; + goto out; } if (base_len < s->common.len) { ret = bdrv_truncate(base, s->common.len); if (ret) { - goto exit_restore_reopen; + goto out; } } @@ -128,7 +161,7 @@ wait: if (s->on_error == BLOCKDEV_ON_ERROR_STOP || s->on_error == BLOCKDEV_ON_ERROR_REPORT|| (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { - goto exit_free_buf; + goto out; } else { n = 0; continue; @@ -140,27 +173,12 @@ wait: ret = 0; - if (!block_job_is_cancelled(&s->common) && sector_num == end) { - /* success */ - ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); - } - -exit_free_buf: +out: qemu_vfree(buf); -exit_restore_reopen: - /* restore base open flags here if appropriate (e.g., change the base back - * to r/o). These reopens do not need to be atomic, since we won't abort - * even on failure here */ - if (s->base_flags != bdrv_get_flags(base)) { - bdrv_reopen(base, s->base_flags, NULL); - } - overlay_bs = bdrv_find_overlay(active, top); - if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { - bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); - } - g_free(s->backing_file_str); - block_job_completed(&s->common, ret); + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&s->common, commit_complete, data); } static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) diff --git a/blockdev.c b/blockdev.c index 13763225ee..57910b82c7 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2027,6 +2027,7 @@ void qmp_block_commit(const char *device, { BlockDriverState *bs; BlockDriverState *base_bs, *top_bs; + AioContext *aio_context; Error *local_err = NULL; /* This will be part of the QMP command, if/when the * BlockdevOnError change for blkmirror makes it in @@ -2037,9 +2038,6 @@ void qmp_block_commit(const char *device, speed = 0; } - /* drain all i/o before commits */ - bdrv_drain_all(); - /* Important Note: * libvirt relies on the DeviceNotFound error class in order to probe for * live commit feature versions; for this to work, we must make sure to @@ -2051,8 +2049,14 @@ void qmp_block_commit(const char *device, return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + /* drain all i/o before commits */ + bdrv_drain_all(); + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, errp)) { - return; + goto out; } /* default top_bs is the active layer */ @@ -2066,9 +2070,11 @@ void qmp_block_commit(const char *device, if (top_bs == NULL) { error_setg(errp, "Top image file %s not found", top ? top : "NULL"); - return; + goto out; } + assert(bdrv_get_aio_context(top_bs) == aio_context); + if (has_base && base) { base_bs = bdrv_find_backing_image(top_bs, base); } else { @@ -2077,20 +2083,22 @@ void qmp_block_commit(const char *device, if (base_bs == NULL) { error_set(errp, QERR_BASE_NOT_FOUND, base ? base : "NULL"); - return; + goto out; } + assert(bdrv_get_aio_context(base_bs) == aio_context); + /* Do not allow attempts to commit an image into itself */ if (top_bs == base_bs) { error_setg(errp, "cannot commit an image into itself"); - return; + goto out; } if (top_bs == bs) { if (has_backing_file) { error_setg(errp, "'backing-file' specified," " but 'top' is the active layer"); - return; + goto out; } commit_active_start(bs, base_bs, speed, on_error, block_job_cb, bs, &local_err); @@ -2100,8 +2108,11 @@ void qmp_block_commit(const char *device, } if (local_err != NULL) { error_propagate(errp, local_err); - return; + goto out; } + +out: + aio_context_release(aio_context); } void qmp_drive_backup(const char *device, const char *target, From b112a65c52aa45a23b83b1e0d56db3b7cc44597e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 21 Oct 2014 12:04:00 +0100 Subject: [PATCH 53/53] block: declare blockjobs and dataplane friends! Now that blockjobs use AioContext they are safe for use with dataplane. Unblock them! Signed-off-by: Stefan Hajnoczi Reviewed-by: Max Reitz Message-id: 1413889440-32577-12-git-send-email-stefanha@redhat.com --- blockjob.c | 1 + hw/block/dataplane/virtio-blk.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/blockjob.c b/blockjob.c index cd4b573adb..ba2255d91f 100644 --- a/blockjob.c +++ b/blockjob.c @@ -50,6 +50,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, error_setg(&job->blocker, "block device is in use by block job: %s", BlockJobType_lookup[driver->job_type]); bdrv_op_block_all(bs, job->blocker); + bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); job->driver = driver; job->bs = bs; diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index 45b1164c3e..1222a37f4f 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -196,6 +196,11 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf, blk_op_block_all(conf->conf.blk, s->blocker); blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_RESIZE, s->blocker); blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DRIVE_DEL, s->blocker); + blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, s->blocker); + blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_COMMIT, s->blocker); + blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR, s->blocker); + blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker); + blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker); *dataplane = s; }