From bc3a7f90ff44037bbe898708081db23a08fa7189 Mon Sep 17 00:00:00 2001 From: Chunyan Liu Date: Wed, 2 Jul 2014 12:27:29 +0800 Subject: [PATCH 01/11] Fix nocow typos in manpage Signed-off-by: Chunyan Liu Signed-off-by: Stefan Hajnoczi --- qemu-doc.texi | 4 ++-- qemu-img.texi | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/qemu-doc.texi b/qemu-doc.texi index ad92c85cba..551619abd7 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -590,7 +590,7 @@ check -r all} is required, which may take some time. This option can only be enabled if @code{compat=1.1} is specified. @item nocow -If this option is set to @code{on}, it will trun off COW of the file. It's only +If this option is set to @code{on}, it will turn off COW of the file. It's only valid on btrfs, no effect on other file systems. Btrfs has low performance when hosting a VM image file, even more when the guest @@ -603,7 +603,7 @@ does. Note: this option is only valid to new or empty files. If there is an existing file which is COW and has data blocks already, it couldn't be changed to NOCOW by setting @code{nocow=on}. One can issue @code{lsattr filename} to check if -the NOCOW flag is set or not (Capitabl 'C' is NOCOW flag). +the NOCOW flag is set or not (Capital 'C' is NOCOW flag). @end table diff --git a/qemu-img.texi b/qemu-img.texi index 8496f3b8dc..514be90f7d 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -475,7 +475,7 @@ check -r all} is required, which may take some time. This option can only be enabled if @code{compat=1.1} is specified. @item nocow -If this option is set to @code{on}, it will trun off COW of the file. It's only +If this option is set to @code{on}, it will turn off COW of the file. It's only valid on btrfs, no effect on other file systems. Btrfs has low performance when hosting a VM image file, even more when the guest @@ -488,7 +488,7 @@ does. Note: this option is only valid to new or empty files. If there is an existing file which is COW and has data blocks already, it couldn't be changed to NOCOW by setting @code{nocow=on}. One can issue @code{lsattr filename} to check if -the NOCOW flag is set or not (Capitabl 'C' is NOCOW flag). +the NOCOW flag is set or not (Capital 'C' is NOCOW flag). @end table From 5a0f6fd5c84573387056e0464a7fc0c6fb70b2dc Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 1 Jul 2014 16:52:21 +0200 Subject: [PATCH 02/11] mirror: Fix qiov size for short requests When mirroring an image of a size that is not a multiple of the mirror job granularity, the last request would have the right nb_sectors argument, but a qiov that is rounded up to the next multiple of the granularity. Don't do this. This fixes a segfault that is caused by raw-posix being confused by this and allocating a buffer with request length, but operating on it with qiov length. [s/Driver/Drive/ in qemu-iotests 041 as suggested by Eric --Stefan] Reported-by: Eric Blake Signed-off-by: Kevin Wolf Tested-by: Eric Blake Reviewed-by: Eric Blake Signed-off-by: Stefan Hajnoczi --- block/mirror.c | 4 +++- tests/qemu-iotests/041 | 5 +++++ tests/qemu-iotests/041.out | 4 ++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 6c3ee7041c..c7a655fc58 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -265,9 +265,11 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector = sector_num; while (nb_chunks-- > 0) { MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); + size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); s->buf_free_count--; - qemu_iovec_add(&op->qiov, buf, s->granularity); + qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); /* Advance the HBitmapIter in parallel, so that we do not examine * the same sector twice. diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 0815e19274..005090ecc2 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -217,6 +217,11 @@ class TestSingleDriveZeroLength(TestSingleDrive): test_small_buffer2 = None test_large_cluster = None +class TestSingleDriveUnalignedLength(TestSingleDrive): + image_len = 1025 * 1024 + test_small_buffer2 = None + test_large_cluster = None + class TestMirrorNoBacking(ImageMirroringTestCase): image_len = 2 * 1024 * 1024 # MB diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index 42147c0b58..24093bc631 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -.............................................. +...................................................... ---------------------------------------------------------------------- -Ran 46 tests +Ran 54 tests OK From 37253e1ec898b167c5ff1fe63e14914bd35e28d9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 1 Jul 2014 14:04:31 +0200 Subject: [PATCH 03/11] MAINTAINERS: add Stefan Hajnoczi to IDE maintainers Make Stefan officially co-maintain hw/ide/ with Kevin. Signed-off-by: Stefan Hajnoczi Acked-by: Kevin Wolf --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e7dc90782e..906f252477 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -563,6 +563,7 @@ Devices ------- IDE M: Kevin Wolf +M: Stefan Hajnoczi S: Odd Fixes F: include/hw/ide.h F: hw/ide/ From d02f8adc6d2a178bcbf77d0413f9a96fdbed53f0 Mon Sep 17 00:00:00 2001 From: Reza Jelveh Date: Tue, 1 Jul 2014 13:13:27 +0200 Subject: [PATCH 04/11] ahci.c: mask unused flags when reading size PRDT DBC The data byte count(DBC) read from the description information is defined for bits 21:00. Bits 30:22 are reserved and bit 31 is the Interrupt on Completion (I) flag. Completion interrupts are triggered after every transaction instead of on I-flag in QEMU. tbl_entry_size is a signed integer and improperly reading the DBC leads to a negative offset that causes sglist allocation to fail. Signed-off-by: Reza Jelveh Reviewed-by: Alexander Graf Reviewed-by: Kevin Wolf Reviewed-by: John Snow Signed-off-by: Stefan Hajnoczi --- hw/ide/ahci.c | 11 ++++++++--- hw/ide/ahci.h | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 9bae22ecb1..cd140d16b8 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -639,6 +639,11 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) } } +static int prdt_tbl_entry_size(const AHCI_SG *tbl) +{ + return (le32_to_cpu(tbl->flags_size) & AHCI_PRDT_SIZE_MASK) + 1; +} + static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset) { AHCICmdHdr *cmd = ad->cur_cmd; @@ -681,7 +686,7 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset) sum = 0; for (i = 0; i < sglist_alloc_hint; i++) { /* flags_size is zero-based */ - tbl_entry_size = (le32_to_cpu(tbl[i].flags_size) + 1); + tbl_entry_size = prdt_tbl_entry_size(&tbl[i]); if (offset <= (sum + tbl_entry_size)) { off_idx = i; off_pos = offset - sum; @@ -700,12 +705,12 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, int offset) qemu_sglist_init(sglist, qbus->parent, (sglist_alloc_hint - off_idx), ad->hba->as); qemu_sglist_add(sglist, le64_to_cpu(tbl[off_idx].addr + off_pos), - le32_to_cpu(tbl[off_idx].flags_size) + 1 - off_pos); + prdt_tbl_entry_size(&tbl[off_idx]) - off_pos); for (i = off_idx + 1; i < sglist_alloc_hint; i++) { /* flags_size is zero-based */ qemu_sglist_add(sglist, le64_to_cpu(tbl[i].addr), - le32_to_cpu(tbl[i].flags_size) + 1); + prdt_tbl_entry_size(&tbl[i])); } } diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index 9a4064f892..f418b30ce7 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -201,6 +201,8 @@ #define AHCI_COMMAND_TABLE_ACMD 0x40 +#define AHCI_PRDT_SIZE_MASK 0x3fffff + #define IDE_FEATURE_DMA 1 #define READ_FPDMA_QUEUED 0x60 From a42a1facb77fb0e3db9a416922de13d9ab66c26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Tue, 1 Jul 2014 13:34:12 +0200 Subject: [PATCH 05/11] qemu-iotests: Disable Quorum testing in 041 when Quorum is not builtin This avoid breaking tests on RHEL6 where gnutls is too old for quorum to be built by default. Signed-off-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/041 | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 005090ecc2..5dbd4ee91b 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -740,6 +740,9 @@ class TestRepairQuorum(ImageMirroringTestCase): image_len = 1 * 1024 * 1024 # MB IMAGES = [ quorum_img1, quorum_img2, quorum_img3 ] + def has_quorum(self): + return 'quorum' in iotests.qemu_img_pipe('--help') + def setUp(self): self.vm = iotests.VM() @@ -757,8 +760,9 @@ class TestRepairQuorum(ImageMirroringTestCase): #assemble the quorum block device from the individual files args = { "options" : { "driver": "quorum", "id": "quorum0", "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] } } - result = self.vm.qmp("blockdev-add", **args) - self.assert_qmp(result, 'return', {}) + if self.has_quorum(): + result = self.vm.qmp("blockdev-add", **args) + self.assert_qmp(result, 'return', {}) def tearDown(self): @@ -771,6 +775,9 @@ class TestRepairQuorum(ImageMirroringTestCase): pass def test_complete(self): + if not self.has_quorum(): + return + self.assert_no_active_block_jobs() result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', @@ -789,6 +796,9 @@ class TestRepairQuorum(ImageMirroringTestCase): 'target image does not match source after mirroring') def test_cancel(self): + if not self.has_quorum(): + return + self.assert_no_active_block_jobs() result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', @@ -805,6 +815,9 @@ class TestRepairQuorum(ImageMirroringTestCase): self.vm.shutdown() def test_cancel_after_ready(self): + if not self.has_quorum(): + return + self.assert_no_active_block_jobs() result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', @@ -823,6 +836,9 @@ class TestRepairQuorum(ImageMirroringTestCase): 'target image does not match source after mirroring') def test_pause(self): + if not self.has_quorum(): + return + self.assert_no_active_block_jobs() result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', @@ -851,6 +867,9 @@ class TestRepairQuorum(ImageMirroringTestCase): 'target image does not match source after mirroring') def test_medium_not_found(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='ide1-cd0', sync='full', node_name='repair0', replaces='img1', @@ -858,6 +877,9 @@ class TestRepairQuorum(ImageMirroringTestCase): self.assert_qmp(result, 'error/class', 'GenericError') def test_image_not_found(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', node_name='repair0', replaces='img1', @@ -866,6 +888,9 @@ class TestRepairQuorum(ImageMirroringTestCase): self.assert_qmp(result, 'error/class', 'GenericError') def test_device_not_found(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='nonexistent', sync='full', node_name='repair0', replaces='img1', @@ -873,6 +898,9 @@ class TestRepairQuorum(ImageMirroringTestCase): self.assert_qmp(result, 'error/class', 'DeviceNotFound') def test_wrong_sync_mode(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='quorum0', node_name='repair0', replaces='img1', @@ -880,12 +908,18 @@ class TestRepairQuorum(ImageMirroringTestCase): self.assert_qmp(result, 'error/class', 'GenericError') def test_no_node_name(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', replaces='img1', target=quorum_repair_img, format=iotests.imgfmt) self.assert_qmp(result, 'error/class', 'GenericError') def test_unexistant_replaces(self): + if not self.has_quorum(): + return + result = self.vm.qmp('drive-mirror', device='quorum0', sync='full', node_name='repair0', replaces='img77', @@ -893,6 +927,9 @@ class TestRepairQuorum(ImageMirroringTestCase): self.assert_qmp(result, 'error/class', 'GenericError') def test_after_a_quorum_snapshot(self): + if not self.has_quorum(): + return + result = self.vm.qmp('blockdev-snapshot-sync', node_name='img1', snapshot_file=quorum_snapshot_file, snapshot_node_name="snap1"); From aa729704f48331787d0afac510df214aedda2843 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:16 +0200 Subject: [PATCH 06/11] raw-posix: Fix raw_getlength() to always return -errno on error We got a merry mix of -1 and -errno here. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 825a0c878f..fa005b38b7 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1133,12 +1133,12 @@ static int64_t raw_getlength(BlockDriverState *bs) struct stat st; if (fstat(fd, &st)) - return -1; + return -errno; if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { struct disklabel dl; if (ioctl(fd, DIOCGDINFO, &dl)) - return -1; + return -errno; return (uint64_t)dl.d_secsize * dl.d_partitions[DISKPART(st.st_rdev)].p_size; } else @@ -1152,7 +1152,7 @@ static int64_t raw_getlength(BlockDriverState *bs) struct stat st; if (fstat(fd, &st)) - return -1; + return -errno; if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { struct dkwedge_info dkw; @@ -1162,7 +1162,7 @@ static int64_t raw_getlength(BlockDriverState *bs) struct disklabel dl; if (ioctl(fd, DIOCGDINFO, &dl)) - return -1; + return -errno; return (uint64_t)dl.d_secsize * dl.d_partitions[DISKPART(st.st_rdev)].p_size; } @@ -1175,6 +1175,7 @@ static int64_t raw_getlength(BlockDriverState *bs) BDRVRawState *s = bs->opaque; struct dk_minfo minfo; int ret; + int64_t size; ret = fd_open(bs); if (ret < 0) { @@ -1193,7 +1194,11 @@ static int64_t raw_getlength(BlockDriverState *bs) * There are reports that lseek on some devices fails, but * irc discussion said that contingency on contingency was overkill. */ - return lseek(s->fd, 0, SEEK_END); + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; } #elif defined(CONFIG_BSD) static int64_t raw_getlength(BlockDriverState *bs) @@ -1231,6 +1236,9 @@ again: size = LLONG_MAX; #else size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } #endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) switch(s->type) { @@ -1247,6 +1255,9 @@ again: #endif } else { size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } } return size; } @@ -1255,13 +1266,18 @@ static int64_t raw_getlength(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; int ret; + int64_t size; ret = fd_open(bs); if (ret < 0) { return ret; } - return lseek(s->fd, 0, SEEK_END); + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; } #endif From 5a18e67dfd515992076c5fcae47035fdd3ed2462 Mon Sep 17 00:00:00 2001 From: Le Tan Date: Thu, 3 Jul 2014 16:26:27 +0800 Subject: [PATCH 07/11] ahci: map memory via device's address space instead of address_space_memory In map_page() in hw/ide/ahci.c, replace cpu_physical_memory_map() and cpu_physical_memory_unmap() with dma_memory_map() and dma_memory_unmap(), because ahci devices should not access memory directly but via their address space. Add an AddressSpace parameter to map_page(). In order to call map_page(), we should pass the AHCIState.as as the AddressSpace argument. Signed-off-by: Le Tan Reviewed-by: Michael S. Tsirkin Signed-off-by: Stefan Hajnoczi --- hw/ide/ahci.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index cd140d16b8..604152a823 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -175,17 +175,18 @@ static void ahci_trigger_irq(AHCIState *s, AHCIDevice *d, ahci_check_irq(s); } -static void map_page(uint8_t **ptr, uint64_t addr, uint32_t wanted) +static void map_page(AddressSpace *as, uint8_t **ptr, uint64_t addr, + uint32_t wanted) { hwaddr len = wanted; if (*ptr) { - cpu_physical_memory_unmap(*ptr, len, 1, len); + dma_memory_unmap(as, *ptr, len, DMA_DIRECTION_FROM_DEVICE, len); } - *ptr = cpu_physical_memory_map(addr, &len, 1); + *ptr = dma_memory_map(as, addr, &len, DMA_DIRECTION_FROM_DEVICE); if (len < wanted) { - cpu_physical_memory_unmap(*ptr, len, 1, len); + dma_memory_unmap(as, *ptr, len, DMA_DIRECTION_FROM_DEVICE, len); *ptr = NULL; } } @@ -198,24 +199,24 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val) switch (offset) { case PORT_LST_ADDR: pr->lst_addr = val; - map_page(&s->dev[port].lst, + map_page(s->as, &s->dev[port].lst, ((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024); s->dev[port].cur_cmd = NULL; break; case PORT_LST_ADDR_HI: pr->lst_addr_hi = val; - map_page(&s->dev[port].lst, + map_page(s->as, &s->dev[port].lst, ((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024); s->dev[port].cur_cmd = NULL; break; case PORT_FIS_ADDR: pr->fis_addr = val; - map_page(&s->dev[port].res_fis, + map_page(s->as, &s->dev[port].res_fis, ((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256); break; case PORT_FIS_ADDR_HI: pr->fis_addr_hi = val; - map_page(&s->dev[port].res_fis, + map_page(s->as, &s->dev[port].res_fis, ((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256); break; case PORT_IRQ_STAT: @@ -1265,9 +1266,9 @@ static int ahci_state_post_load(void *opaque, int version_id) ad = &s->dev[i]; AHCIPortRegs *pr = &ad->port_regs; - map_page(&ad->lst, + map_page(s->as, &ad->lst, ((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024); - map_page(&ad->res_fis, + map_page(s->as, &ad->res_fis, ((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256); /* * All pending i/o should be flushed out on a migrate. However, From 448ad91db4a560c01f89bd6f7e4bec7d869926a5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 4 Jul 2014 18:04:33 +0800 Subject: [PATCH 08/11] block: block: introduce APIs for submitting IO as a batch This patch introduces three APIs so that following patches can support queuing I/O requests and submitting them as a batch for improving I/O performance. Reviewed-by: Paolo Bonzini Signed-off-by: Ming Lei Signed-off-by: Stefan Hajnoczi --- block.c | 31 +++++++++++++++++++++++++++++++ include/block/block.h | 4 ++++ include/block/block_int.h | 5 +++++ 3 files changed, 40 insertions(+) diff --git a/block.c b/block.c index f80e2b2b58..8800a6b5b3 100644 --- a/block.c +++ b/block.c @@ -1905,6 +1905,7 @@ void bdrv_drain_all(void) bool bs_busy; aio_context_acquire(aio_context); + bdrv_flush_io_queue(bs); bdrv_start_throttled_reqs(bs); bs_busy = bdrv_requests_pending(bs); bs_busy |= aio_poll(aio_context, bs_busy); @@ -5782,3 +5783,33 @@ BlockDriverState *check_to_replace_node(const char *node_name, Error **errp) return to_replace_bs; } + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} diff --git a/include/block/block.h b/include/block/block.h index baecc26dfc..32d36760fd 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -584,4 +584,8 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs); */ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context); +void bdrv_io_plug(BlockDriverState *bs); +void bdrv_io_unplug(BlockDriverState *bs); +void bdrv_flush_io_queue(BlockDriverState *bs); + #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 8f8e65e763..f6c3befed8 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -261,6 +261,11 @@ struct BlockDriver { void (*bdrv_attach_aio_context)(BlockDriverState *bs, AioContext *new_context); + /* io queue for linux-aio */ + void (*bdrv_io_plug)(BlockDriverState *bs); + void (*bdrv_io_unplug)(BlockDriverState *bs); + void (*bdrv_flush_io_queue)(BlockDriverState *bs); + QLIST_ENTRY(BlockDriver) list; }; From 1b3abdcccf18d98c3952b41be0bc1db3ef6009dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 4 Jul 2014 18:04:34 +0800 Subject: [PATCH 09/11] linux-aio: implement io plug, unplug and flush io queue This patch implements .bdrv_io_plug, .bdrv_io_unplug and .bdrv_flush_io_queue callbacks for linux-aio Block Drivers, so that submitting I/O as a batch can be supported on linux-aio. [Unprocessed requests are completed with -EIO instead of a bogus ret value. --Stefan] Signed-off-by: Ming Lei Signed-off-by: Stefan Hajnoczi --- block/linux-aio.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++- block/raw-aio.h | 2 + block/raw-posix.c | 45 ++++++++++++++++++++++ 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/block/linux-aio.c b/block/linux-aio.c index f0a2c087b2..48673690ac 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -25,6 +25,8 @@ */ #define MAX_EVENTS 128 +#define MAX_QUEUED_IO 128 + struct qemu_laiocb { BlockDriverAIOCB common; struct qemu_laio_state *ctx; @@ -36,9 +38,19 @@ struct qemu_laiocb { QLIST_ENTRY(qemu_laiocb) node; }; +typedef struct { + struct iocb *iocbs[MAX_QUEUED_IO]; + int plugged; + unsigned int size; + unsigned int idx; +} LaioQueue; + struct qemu_laio_state { io_context_t ctx; EventNotifier e; + + /* io queue for submit at batch */ + LaioQueue io_q; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -135,6 +147,79 @@ static const AIOCBInfo laio_aiocb_info = { .cancel = laio_cancel, }; +static void ioq_init(LaioQueue *io_q) +{ + io_q->size = MAX_QUEUED_IO; + io_q->idx = 0; + io_q->plugged = 0; +} + +static int ioq_submit(struct qemu_laio_state *s) +{ + int ret, i = 0; + int len = s->io_q.idx; + + do { + ret = io_submit(s->ctx, len, s->io_q.iocbs); + } while (i++ < 3 && ret == -EAGAIN); + + /* empty io queue */ + s->io_q.idx = 0; + + if (ret < 0) { + i = 0; + } else { + i = ret; + } + + for (; i < len; i++) { + struct qemu_laiocb *laiocb = + container_of(s->io_q.iocbs[i], struct qemu_laiocb, iocb); + + laiocb->ret = (ret < 0) ? ret : -EIO; + qemu_laio_process_completion(s, laiocb); + } + return ret; +} + +static void ioq_enqueue(struct qemu_laio_state *s, struct iocb *iocb) +{ + unsigned int idx = s->io_q.idx; + + s->io_q.iocbs[idx++] = iocb; + s->io_q.idx = idx; + + /* submit immediately if queue is full */ + if (idx == s->io_q.size) { + ioq_submit(s); + } +} + +void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +{ + struct qemu_laio_state *s = aio_ctx; + + s->io_q.plugged++; +} + +int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +{ + struct qemu_laio_state *s = aio_ctx; + int ret = 0; + + assert(s->io_q.plugged > 0 || !unplug); + + if (unplug && --s->io_q.plugged > 0) { + return 0; + } + + if (s->io_q.idx > 0) { + ret = ioq_submit(s); + } + + return ret; +} + BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) @@ -168,8 +253,13 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - if (io_submit(s->ctx, 1, &iocbs) < 0) - goto out_free_aiocb; + if (!s->io_q.plugged) { + if (io_submit(s->ctx, 1, &iocbs) < 0) { + goto out_free_aiocb; + } + } else { + ioq_enqueue(s, iocbs); + } return &laiocb->common; out_free_aiocb: @@ -204,6 +294,8 @@ void *laio_init(void) goto out_close_efd; } + ioq_init(&s->io_q); + return s; out_close_efd: diff --git a/block/raw-aio.h b/block/raw-aio.h index 8cf084eeb5..e18c97509a 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -40,6 +40,8 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, BlockDriverCompletionFunc *cb, void *opaque, int type); void laio_detach_aio_context(void *s, AioContext *old_context); void laio_attach_aio_context(void *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, void *aio_ctx); +int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); #endif #ifdef _WIN32 diff --git a/block/raw-posix.c b/block/raw-posix.c index fa005b38b7..a857def671 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1057,6 +1057,36 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, cb, opaque, type); } +static void raw_aio_plug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_plug(bs, s->aio_ctx); + } +#endif +} + +static void raw_aio_unplug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, true); + } +#endif +} + +static void raw_aio_flush_io_queue(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, false); + } +#endif +} + static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) @@ -1544,6 +1574,9 @@ static BlockDriver bdrv_file = { .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1943,6 +1976,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2088,6 +2124,9 @@ static BlockDriver bdrv_host_floppy = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2216,6 +2255,9 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2350,6 +2392,9 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, From dd67c1d7e75151e2c058ccdd2162643074357442 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 4 Jul 2014 18:04:35 +0800 Subject: [PATCH 10/11] dataplane: submit I/O as a batch Before commit 580b6b2aa2(dataplane: use the QEMU block layer for I/O), dataplane for virtio-blk submits block I/O as a batch. This commit 580b6b2aa2 replaces the custom linux AIO implementation(including submit I/O as a batch) with QEMU block layer, but this commit causes ~40% throughput regression on virtio-blk performance, and removing submitting I/O as a batch is one of the causes. This patch applies the newly introduced bdrv_io_plug() and bdrv_io_unplug() interfaces to support submitting I/O at batch for Qemu block layer, and in my test, the change can improve throughput by ~30% with 'aio=native'. Following my fio test script: [global] direct=1 size=4G bsrange=4k-4k timeout=40 numjobs=4 ioengine=libaio iodepth=64 filename=/dev/vdc group_reporting=1 [f] rw=randread Result on one of my small machine(host: x86_64, 2cores, 4thread, guest: 4cores): - qemu master: 65K IOPS - qemu master with these patches: 92K IOPS - 2.0.0 release(dataplane using custom linux aio): 104K IOPS Reviewed-by: Paolo Bonzini Signed-off-by: Ming Lei Signed-off-by: Stefan Hajnoczi --- hw/block/dataplane/virtio-blk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index 4c5ba18122..4bc0729bf7 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -84,6 +84,7 @@ static void handle_notify(EventNotifier *e) }; event_notifier_test_and_clear(&s->host_notifier); + bdrv_io_plug(s->blk->conf.bs); for (;;) { /* Disable guest->host notifies to avoid unnecessary vmexits */ vring_disable_notification(s->vdev, &s->vring); @@ -117,6 +118,7 @@ static void handle_notify(EventNotifier *e) break; } } + bdrv_io_unplug(s->blk->conf.bs); } /* Context: QEMU global mutex held */ From f4eb32b590bf58c1c67570775eb78beb09964fad Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 20 May 2014 14:29:01 +0200 Subject: [PATCH 11/11] qmp: show QOM properties in device-list-properties Devices can use a mix of qdev and QOM properties. Currently only the qdev properties are displayed by device-list-properties. This patch extends the property enumeration algorithm to also display QOM properties (excluding the implicit "type", "realized", "hotpluggable", and "parent_bus" properties). When a qdev property exists, use the qdev type name to preserve backwards compatibility. QOM type names can be different for bool (qdev on/off) and str (used by qdev pointers). Signed-off-by: Stefan Hajnoczi --- qmp.c | 101 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git a/qmp.c b/qmp.c index dca6efb7b8..0d2553abf5 100644 --- a/qmp.c +++ b/qmp.c @@ -433,11 +433,57 @@ ObjectTypeInfoList *qmp_qom_list_types(bool has_implements, return ret; } +/* Return a DevicePropertyInfo for a qdev property. + * + * If a qdev property with the given name does not exist, use the given default + * type. If the qdev property info should not be shown, return NULL. + * + * The caller must free the return value. + */ +static DevicePropertyInfo *make_device_property_info(ObjectClass *klass, + const char *name, + const char *default_type) +{ + DevicePropertyInfo *info; + Property *prop; + + do { + for (prop = DEVICE_CLASS(klass)->props; prop && prop->name; prop++) { + if (strcmp(name, prop->name) != 0) { + continue; + } + + /* + * TODO Properties without a parser are just for dirty hacks. + * qdev_prop_ptr is the only such PropertyInfo. It's marked + * for removal. This conditional should be removed along with + * it. + */ + if (!prop->info->set) { + return NULL; /* no way to set it, don't show */ + } + + info = g_malloc0(sizeof(*info)); + info->name = g_strdup(prop->name); + info->type = g_strdup(prop->info->legacy_name ?: prop->info->name); + return info; + } + klass = object_class_get_parent(klass); + } while (klass != object_class_by_name(TYPE_DEVICE)); + + /* Not a qdev property, use the default type */ + info = g_malloc0(sizeof(*info)); + info->name = g_strdup(name); + info->type = g_strdup(default_type); + return info; +} + DevicePropertyInfoList *qmp_device_list_properties(const char *typename, Error **errp) { ObjectClass *klass; - Property *prop; + Object *obj; + ObjectProperty *prop; DevicePropertyInfoList *prop_list = NULL; klass = object_class_by_name(typename); @@ -453,32 +499,39 @@ DevicePropertyInfoList *qmp_device_list_properties(const char *typename, return NULL; } - do { - for (prop = DEVICE_CLASS(klass)->props; prop && prop->name; prop++) { - DevicePropertyInfoList *entry; - DevicePropertyInfo *info; + obj = object_new(typename); - /* - * TODO Properties without a parser are just for dirty hacks. - * qdev_prop_ptr is the only such PropertyInfo. It's marked - * for removal. This conditional should be removed along with - * it. - */ - if (!prop->info->set) { - continue; /* no way to set it, don't show */ - } + QTAILQ_FOREACH(prop, &obj->properties, node) { + DevicePropertyInfo *info; + DevicePropertyInfoList *entry; - info = g_malloc0(sizeof(*info)); - info->name = g_strdup(prop->name); - info->type = g_strdup(prop->info->legacy_name ?: prop->info->name); - - entry = g_malloc0(sizeof(*entry)); - entry->value = info; - entry->next = prop_list; - prop_list = entry; + /* Skip Object and DeviceState properties */ + if (strcmp(prop->name, "type") == 0 || + strcmp(prop->name, "realized") == 0 || + strcmp(prop->name, "hotpluggable") == 0 || + strcmp(prop->name, "parent_bus") == 0) { + continue; } - klass = object_class_get_parent(klass); - } while (klass != object_class_by_name(TYPE_DEVICE)); + + /* Skip legacy properties since they are just string versions of + * properties that we already list. + */ + if (strstart(prop->name, "legacy-", NULL)) { + continue; + } + + info = make_device_property_info(klass, prop->name, prop->type); + if (!info) { + continue; + } + + entry = g_malloc0(sizeof(*entry)); + entry->value = info; + entry->next = prop_list; + prop_list = entry; + } + + object_unref(obj); return prop_list; }