From f9b49088952ddb6de6e9c2e5822a5e15434abc3b Mon Sep 17 00:00:00 2001 From: Elena Afanasova Date: Sat, 17 Oct 2020 14:01:01 -0700 Subject: [PATCH 01/33] accel/kvm: add PIO ioeventfds only in case kvm_eventfds_allowed is true Signed-off-by: Stefan Hajnoczi Signed-off-by: Elena Afanasova Message-Id: <20201017210102.26036-1-eafanasova@gmail.com> Signed-off-by: Stefan Hajnoczi --- accel/kvm/kvm-all.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 9ef5daf4c5..baaa54249d 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2239,8 +2239,10 @@ static int kvm_init(MachineState *ms) kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0); - memory_listener_register(&kvm_io_listener, - &address_space_io); + if (kvm_eventfds_allowed) { + memory_listener_register(&kvm_io_listener, + &address_space_io); + } memory_listener_register(&kvm_coalesced_pio_listener, &address_space_io); From e6ffd757829a0dcff649c99e6304a287cc6d0d27 Mon Sep 17 00:00:00 2001 From: Elena Afanasova Date: Mon, 19 Oct 2020 13:20:13 -0700 Subject: [PATCH 02/33] softmmu/memory: fix memory_region_ioeventfd_equal() Eventfd can be registered with a zero length when fast_mmio is true. Handle this case properly when dispatching through QEMU. Signed-off-by: Elena Afanasova Message-id: cf71a62eb04e61932ff8ffdd02e0b2aab4f495a0.camel@gmail.com Signed-off-by: Stefan Hajnoczi --- softmmu/memory.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/softmmu/memory.c b/softmmu/memory.c index 21d533d8ed..8aba4114cf 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -205,8 +205,15 @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a, static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a, MemoryRegionIoeventfd *b) { - return !memory_region_ioeventfd_before(a, b) - && !memory_region_ioeventfd_before(b, a); + if (int128_eq(a->addr.start, b->addr.start) && + (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) || + (int128_eq(a->addr.size, b->addr.size) && + (a->match_data == b->match_data) && + ((a->match_data && (a->data == b->data)) || !a->match_data) && + (a->e == b->e)))) + return true; + + return false; } /* Range of memory in the global map. Addresses are absolute. */ From dc0b4d94d49a37751b37184fc891a86af3cc72e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 1 Jul 2020 16:06:34 +0200 Subject: [PATCH 03/33] MAINTAINERS: Cover "block/nvme.h" file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "block/nvme.h" header is shared by both the NVMe block driver and the NVMe emulated device. Add the 'F:' entry on both sections, so all maintainers/reviewers are notified when it is changed. Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Reviewed-by: Klaus Jensen Message-Id: <20200701140634.25994-1-philmd@redhat.com> --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c1d16026ba..ea47b9e889 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1878,6 +1878,7 @@ M: Klaus Jensen L: qemu-block@nongnu.org S: Supported F: hw/block/nvme* +F: include/block/nvme.h F: tests/qtest/nvme-test.c F: docs/specs/nvme.txt T: git git://git.infradead.org/qemu-nvme.git nvme-next @@ -2962,6 +2963,7 @@ R: Fam Zheng L: qemu-block@nongnu.org S: Supported F: block/nvme* +F: include/block/nvme.h T: git https://github.com/stefanha/qemu.git block Bootdevice From 8526e39e9974228f937d04e8eae140add8b17b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:43 +0100 Subject: [PATCH 04/33] block/nvme: Use hex format to display offset in trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the same format used for the hw/vfio/ trace events. Suggested-by: Eric Auger Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-3-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/trace-events | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/trace-events b/block/trace-events index 0e351c3fa3..0955c85c78 100644 --- a/block/trace-events +++ b/block/trace-events @@ -144,13 +144,13 @@ nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" nvme_handle_event(void *s) "s %p" nvme_poll_cb(void *s) "s %p" -nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d" -nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d" +nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d" +nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d" nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x" -nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d" -nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d" -nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64"" -nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d" +nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d" +nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d" +nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64"" +nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" nvme_dma_map_flush(void *s) "s %p" nvme_free_req_queue_wait(void *q) "q %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" From 58ad6ae0cbad626bd574bc1347f210a35a8316eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:44 +0100 Subject: [PATCH 05/33] block/nvme: Report warning with warn_report() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of displaying warning on stderr, use warn_report() which also displays it on the monitor. Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-4-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 739a0a700c..6f1d7f9b2a 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -399,8 +399,8 @@ static bool nvme_process_completion(NVMeQueuePair *q) } cid = le16_to_cpu(c->cid); if (cid == 0 || cid > NVME_QUEUE_SIZE) { - fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n", - cid); + warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", " + "queue size: %u", cid, NVME_QUEUE_SIZE); continue; } trace_nvme_complete_command(s, q->index, cid); From 15b2260bef333762904535affa7fb30390b3fb98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:45 +0100 Subject: [PATCH 06/33] block/nvme: Trace controller capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Controllers have different capabilities and report them in the CAP register. We are particularly interested by the page size limits. Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-5-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 13 +++++++++++++ block/trace-events | 2 ++ 2 files changed, 15 insertions(+) diff --git a/block/nvme.c b/block/nvme.c index 6f1d7f9b2a..361b5772b7 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -727,6 +727,19 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, * Initialization". */ cap = le64_to_cpu(regs->cap); + trace_nvme_controller_capability_raw(cap); + trace_nvme_controller_capability("Maximum Queue Entries Supported", + 1 + NVME_CAP_MQES(cap)); + trace_nvme_controller_capability("Contiguous Queues Required", + NVME_CAP_CQR(cap)); + trace_nvme_controller_capability("Doorbell Stride", + 2 << (2 + NVME_CAP_DSTRD(cap))); + trace_nvme_controller_capability("Subsystem Reset Supported", + NVME_CAP_NSSRS(cap)); + trace_nvme_controller_capability("Memory Page Size Minimum", + 1 << (12 + NVME_CAP_MPSMIN(cap))); + trace_nvme_controller_capability("Memory Page Size Maximum", + 1 << (12 + NVME_CAP_MPSMAX(cap))); if (!NVME_CAP_CSS(cap)) { error_setg(errp, "Device doesn't support NVMe command set"); ret = -EINVAL; diff --git a/block/trace-events b/block/trace-events index 0955c85c78..b90b07b15f 100644 --- a/block/trace-events +++ b/block/trace-events @@ -134,6 +134,8 @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" # nvme.c +nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64 +nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64 nvme_kick(void *s, int queue) "s %p queue %d" nvme_dma_flush_queue_wait(void *s) "s %p" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" From 1c914cd1208fcf7fc61edb043f091b9412642ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:46 +0100 Subject: [PATCH 07/33] block/nvme: Trace nvme_poll_queue() per queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we want to enable multiple queues, report the event in each nvme_poll_queue() call, rather than once in the callback calling nvme_poll_queues(). Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-6-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 2 +- block/trace-events | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 361b5772b7..8d74401ae7 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -594,6 +594,7 @@ static bool nvme_poll_queue(NVMeQueuePair *q) const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; + trace_nvme_poll_queue(q->s, q->index); /* * Do an early check for completions. q->lock isn't needed because * nvme_process_completion() only runs in the event loop thread and @@ -684,7 +685,6 @@ static bool nvme_poll_cb(void *opaque) BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier[MSIX_SHARED_IRQ_IDX]); - trace_nvme_poll_cb(s); return nvme_poll_queues(s); } diff --git a/block/trace-events b/block/trace-events index b90b07b15f..86292f3312 100644 --- a/block/trace-events +++ b/block/trace-events @@ -145,7 +145,7 @@ nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" nvme_handle_event(void *s) "s %p" -nvme_poll_cb(void *s) "s %p" +nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u" nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d" nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d" nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x" From 51e98b6d21dbaf0c6adcbf00b76bfbe0a0537c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:47 +0100 Subject: [PATCH 08/33] block/nvme: Improve nvme_free_req_queue_wait() trace information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit What we want to trace is the block driver state and the queue index. Suggested-by: Stefan Hajnoczi Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-7-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 2 +- block/trace-events | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 8d74401ae7..29d2541b91 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -292,7 +292,7 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) while (q->free_req_head == -1) { if (qemu_in_coroutine()) { - trace_nvme_free_req_queue_wait(q); + trace_nvme_free_req_queue_wait(q->s, q->index); qemu_co_queue_wait(&q->free_req_queue, &q->lock); } else { qemu_mutex_unlock(&q->lock); diff --git a/block/trace-events b/block/trace-events index 86292f3312..cc5e2b55cb 100644 --- a/block/trace-events +++ b/block/trace-events @@ -154,7 +154,7 @@ nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64"" nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" nvme_dma_map_flush(void *s) "s %p" -nvme_free_req_queue_wait(void *q) "q %p" +nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" From 6e1e9ff2d3709bb89156ea8e42239b6ee96a2a49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:48 +0100 Subject: [PATCH 09/33] block/nvme: Trace queue pair creation/deletion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-8-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 3 +++ block/trace-events | 2 ++ 2 files changed, 5 insertions(+) diff --git a/block/nvme.c b/block/nvme.c index 29d2541b91..e95d59d312 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -181,6 +181,7 @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, static void nvme_free_queue_pair(NVMeQueuePair *q) { + trace_nvme_free_queue_pair(q->index, q); if (q->completion_bh) { qemu_bh_delete(q->completion_bh); } @@ -216,6 +217,8 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, if (!q) { return NULL; } + trace_nvme_create_queue_pair(idx, q, size, aio_context, + event_notifier_get_fd(s->irq_notifier)); q->prp_list_pages = qemu_try_memalign(s->page_size, s->page_size * NVME_NUM_REQS); if (!q->prp_list_pages) { diff --git a/block/trace-events b/block/trace-events index cc5e2b55cb..f6a0f99df1 100644 --- a/block/trace-events +++ b/block/trace-events @@ -155,6 +155,8 @@ nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" byte nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" nvme_dma_map_flush(void *s) "s %p" nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" +nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d" +nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" From 3214b0f0948d5e01ccca62730e60a69e2ded8774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:49 +0100 Subject: [PATCH 10/33] block/nvme: Move definitions before structure declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to use some definitions in structure declarations, move them earlier. No logical change. Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-9-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index e95d59d312..b0629f5de8 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -41,6 +41,16 @@ typedef struct BDRVNVMeState BDRVNVMeState; +/* Same index is used for queues and IRQs */ +#define INDEX_ADMIN 0 +#define INDEX_IO(n) (1 + n) + +/* This driver shares a single MSIX IRQ for the admin and I/O queues */ +enum { + MSIX_SHARED_IRQ_IDX = 0, + MSIX_IRQ_COUNT = 1 +}; + typedef struct { int32_t head, tail; uint8_t *queue; @@ -81,15 +91,6 @@ typedef struct { QEMUBH *completion_bh; } NVMeQueuePair; -#define INDEX_ADMIN 0 -#define INDEX_IO(n) (1 + n) - -/* This driver shares a single MSIX IRQ for the admin and I/O queues */ -enum { - MSIX_SHARED_IRQ_IDX = 0, - MSIX_IRQ_COUNT = 1 -}; - struct BDRVNVMeState { AioContext *aio_context; QEMUVFIOState *vfio; From 1b539bd6dbe1459f160e25610ec2fc3388f700e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:50 +0100 Subject: [PATCH 11/33] block/nvme: Use unsigned integer for queue counter/size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can not have negative queue count/size/index, use unsigned type. Rename 'nr_queues' as 'queue_count' to match the spec naming. Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-10-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 38 ++++++++++++++++++-------------------- block/trace-events | 10 +++++----- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index b0629f5de8..c450499111 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -104,7 +104,7 @@ struct BDRVNVMeState { * [1..]: io queues. */ NVMeQueuePair **queues; - int nr_queues; + unsigned queue_count; size_t page_size; /* How many uint32_t elements does each doorbell entry take. */ size_t doorbell_scale; @@ -161,7 +161,7 @@ static QemuOptsList runtime_opts = { }; static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, - int nentries, int entry_bytes, Error **errp) + unsigned nentries, size_t entry_bytes, Error **errp) { size_t bytes; int r; @@ -206,7 +206,7 @@ static void nvme_free_req_queue_cb(void *opaque) static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, AioContext *aio_context, - int idx, int size, + unsigned idx, size_t size, Error **errp) { int i, r; @@ -623,7 +623,7 @@ static bool nvme_poll_queues(BDRVNVMeState *s) bool progress = false; int i; - for (i = 0; i < s->nr_queues; i++) { + for (i = 0; i < s->queue_count; i++) { if (nvme_poll_queue(s->queues[i])) { progress = true; } @@ -644,10 +644,10 @@ static void nvme_handle_event(EventNotifier *n) static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) { BDRVNVMeState *s = bs->opaque; - int n = s->nr_queues; + unsigned n = s->queue_count; NVMeQueuePair *q; NvmeCmd cmd; - int queue_size = NVME_QUEUE_SIZE; + unsigned queue_size = NVME_QUEUE_SIZE; q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), n, queue_size, errp); @@ -661,7 +661,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw11 = cpu_to_le32(0x3), }; if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { - error_setg(errp, "Failed to create CQ io queue [%d]", n); + error_setg(errp, "Failed to create CQ io queue [%u]", n); goto out_error; } cmd = (NvmeCmd) { @@ -671,12 +671,12 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw11 = cpu_to_le32(0x1 | (n << 16)), }; if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { - error_setg(errp, "Failed to create SQ io queue [%d]", n); + error_setg(errp, "Failed to create SQ io queue [%u]", n); goto out_error; } s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); s->queues[n] = q; - s->nr_queues++; + s->queue_count++; return true; out_error: nvme_free_queue_pair(q); @@ -785,7 +785,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, ret = -EINVAL; goto out; } - s->nr_queues = 1; + s->queue_count = 1; QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) | (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT)); @@ -895,10 +895,9 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, static void nvme_close(BlockDriverState *bs) { - int i; BDRVNVMeState *s = bs->opaque; - for (i = 0; i < s->nr_queues; ++i) { + for (unsigned i = 0; i < s->queue_count; ++i) { nvme_free_queue_pair(s->queues[i]); } g_free(s->queues); @@ -1123,7 +1122,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, }; trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); @@ -1233,7 +1232,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) .ret = -EINPROGRESS, }; - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); @@ -1285,7 +1284,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, cmd.cdw12 = cpu_to_le32(cdw12); trace_nvme_write_zeroes(s, offset, bytes, flags); - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); @@ -1328,7 +1327,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, return -ENOTSUP; } - assert(s->nr_queues > 1); + assert(s->queue_count > 1); buf = qemu_try_memalign(s->page_size, s->page_size); if (!buf) { @@ -1408,7 +1407,7 @@ static void nvme_detach_aio_context(BlockDriverState *bs) { BDRVNVMeState *s = bs->opaque; - for (int i = 0; i < s->nr_queues; i++) { + for (unsigned i = 0; i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; qemu_bh_delete(q->completion_bh); @@ -1429,7 +1428,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs, aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); - for (int i = 0; i < s->nr_queues; i++) { + for (unsigned i = 0; i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; q->completion_bh = @@ -1446,11 +1445,10 @@ static void nvme_aio_plug(BlockDriverState *bs) static void nvme_aio_unplug(BlockDriverState *bs) { - int i; BDRVNVMeState *s = bs->opaque; assert(s->plugged); s->plugged = false; - for (i = INDEX_IO(0); i < s->nr_queues; i++) { + for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; qemu_mutex_lock(&q->lock); nvme_kick(q); diff --git a/block/trace-events b/block/trace-events index f6a0f99df1..8368f4acb0 100644 --- a/block/trace-events +++ b/block/trace-events @@ -136,13 +136,13 @@ qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s # nvme.c nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64 nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64 -nvme_kick(void *s, int queue) "s %p queue %d" +nvme_kick(void *s, unsigned q_index) "s %p q #%u" nvme_dma_flush_queue_wait(void *s) "s %p" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" -nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" -nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d" -nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" -nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" +nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d" +nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u" +nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" +nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" nvme_handle_event(void *s) "s %p" nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u" From 7a5f00dde39c0120b1653f189de9bbd551a6b1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:51 +0100 Subject: [PATCH 12/33] block/nvme: Make nvme_identify() return boolean indicating error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just for consistency, following the example documented since commit e3fe3988d7 ("error: Document Error API usage rules"), return a boolean value indicating an error is set or not. Directly pass errp as the local_err is not requested in our case. Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Message-id: 20201029093306.1063879-11-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index c450499111..9833501245 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -506,9 +506,11 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, return ret; } -static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) +/* Returns true on success, false on failure. */ +static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + bool ret = false; union { NvmeIdCtrl ctrl; NvmeIdNs ns; @@ -585,10 +587,13 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) goto out; } + ret = true; s->blkshift = lbaf->ds; out: qemu_vfio_dma_unmap(s->vfio, id); qemu_vfree(id); + + return ret; } static bool nvme_poll_queue(NVMeQueuePair *q) @@ -701,7 +706,6 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, uint64_t cap; uint64_t timeout_ms; uint64_t deadline, now; - Error *local_err = NULL; volatile NvmeBar *regs = NULL; qemu_co_mutex_init(&s->dma_map_lock); @@ -818,9 +822,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); - nvme_identify(bs, namespace, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_identify(bs, namespace, errp)) { ret = -EIO; goto out; } From dfa9c6c65666d431b20f6ae3f18ee1e729f80f19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:52 +0100 Subject: [PATCH 13/33] block/nvme: Make nvme_init_queue() return boolean indicating error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just for consistency, following the example documented since commit e3fe3988d7 ("error: Document Error API usage rules"), return a boolean value indicating an error is set or not. Directly pass errp as the local_err is not requested in our case. This simplifies a bit nvme_create_queue_pair(). Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-12-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 9833501245..6eaba4e703 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -160,7 +160,8 @@ static QemuOptsList runtime_opts = { }, }; -static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, +/* Returns true on success, false on failure. */ +static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, unsigned nentries, size_t entry_bytes, Error **errp) { size_t bytes; @@ -171,13 +172,15 @@ static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, q->queue = qemu_try_memalign(s->page_size, bytes); if (!q->queue) { error_setg(errp, "Cannot allocate queue"); - return; + return false; } memset(q->queue, 0, bytes); r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova); if (r) { error_setg(errp, "Cannot map queue"); + return false; } + return true; } static void nvme_free_queue_pair(NVMeQueuePair *q) @@ -210,7 +213,6 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, Error **errp) { int i, r; - Error *local_err = NULL; NVMeQueuePair *q; uint64_t prp_list_iova; @@ -247,16 +249,12 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, req->prp_list_iova = prp_list_iova + i * s->page_size; } - nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) { goto fail; } q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; - nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) { goto fail; } q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; From 54248d4d73ddbce38e240056dacf582f92da872d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:53 +0100 Subject: [PATCH 14/33] block/nvme: Introduce Completion Queue definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename Submission Queue flags with 'Sq' to differentiate submission queue flags from command queue flags, and introduce Completion Queue flag definitions. Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Message-id: 20201029093306.1063879-13-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- include/block/nvme.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 8a46d9cf01..3e02d9ca98 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -501,6 +501,11 @@ typedef struct QEMU_PACKED NvmeCreateCq { #define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) +enum NvmeFlagsCq { + NVME_CQ_PC = 1, + NVME_CQ_IEN = 2, +}; + typedef struct QEMU_PACKED NvmeCreateSq { uint8_t opcode; uint8_t flags; @@ -518,12 +523,13 @@ typedef struct QEMU_PACKED NvmeCreateSq { #define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) #define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) -enum NvmeQueueFlags { - NVME_Q_PC = 1, - NVME_Q_PRIO_URGENT = 0, - NVME_Q_PRIO_HIGH = 1, - NVME_Q_PRIO_NORMAL = 2, - NVME_Q_PRIO_LOW = 3, +enum NvmeFlagsSq { + NVME_SQ_PC = 1, + + NVME_SQ_PRIO_URGENT = 0, + NVME_SQ_PRIO_HIGH = 1, + NVME_SQ_PRIO_NORMAL = 2, + NVME_SQ_PRIO_LOW = 3, }; typedef struct QEMU_PACKED NvmeIdentify { From 76a24781cc82d99251f933c399d78af52e1d9b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:54 +0100 Subject: [PATCH 15/33] block/nvme: Use definitions instead of magic values in add_io_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace magic values by definitions, and simplifiy since the number of queues will never reach 64K. Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-14-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 6eaba4e703..7285bd2e27 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -652,6 +652,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) NvmeCmd cmd; unsigned queue_size = NVME_QUEUE_SIZE; + assert(n <= UINT16_MAX); q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), n, queue_size, errp); if (!q) { @@ -660,8 +661,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_CQ, .dptr.prp1 = cpu_to_le64(q->cq.iova), - .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), - .cdw11 = cpu_to_le32(0x3), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), + .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), }; if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to create CQ io queue [%u]", n); @@ -670,8 +671,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_SQ, .dptr.prp1 = cpu_to_le64(q->sq.iova), - .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), - .cdw11 = cpu_to_le32(0x1 | (n << 16)), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), + .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), }; if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to create SQ io queue [%u]", n); From 3c363c073e024fae7b09cad916e63dae89ed46cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:55 +0100 Subject: [PATCH 16/33] block/nvme: Correctly initialize Admin Queue Attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From the specification chapter 3.1.8 "AQA - Admin Queue Attributes" the Admin Submission Queue Size field is a 0’s based value: Admin Submission Queue Size (ASQS): Defines the size of the Admin Submission Queue in entries. Enabling a controller while this field is cleared to 00h produces undefined results. The minimum size of the Admin Submission Queue is two entries. The maximum size of the Admin Submission Queue is 4096 entries. This is a 0’s based value. This bug has never been hit because the device initialization uses a single command synchronously :) Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-15-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 7285bd2e27..0902aa5542 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -789,9 +789,9 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, goto out; } s->queue_count = 1; - QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); - regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) | - (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT)); + QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); + regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | + ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); From 52b75ea8ec4f6cd8028bd8406ebcf2e80ffc6e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:56 +0100 Subject: [PATCH 17/33] block/nvme: Simplify ADMIN queue access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't need to dereference from BDRVNVMeState each time. Use a NVMeQueuePair pointer on the admin queue. The nvme_init() becomes easier to review, matching the style of nvme_add_io_queue(). Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-16-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 0902aa5542..eed12f4933 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -700,6 +700,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *q; AioContext *aio_context = bdrv_get_aio_context(bs); int ret; uint64_t cap; @@ -781,19 +782,18 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Set up admin queue. */ s->queues = g_new(NVMeQueuePair *, 1); - s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0, - NVME_QUEUE_SIZE, - errp); - if (!s->queues[INDEX_ADMIN]) { + q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp); + if (!q) { ret = -EINVAL; goto out; } + s->queues[INDEX_ADMIN] = q; s->queue_count = 1; QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); - regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); - regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); + regs->asq = cpu_to_le64(q->sq.iova); + regs->acq = cpu_to_le64(q->cq.iova); /* After setting up all control registers we can enable device now. */ regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | From 08d5406798f7729de8344ef0ed16d8bad8cdfc84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:57 +0100 Subject: [PATCH 18/33] block/nvme: Simplify nvme_cmd_sync() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As all commands use the ADMIN queue, it is pointless to pass it as argument each time. Remove the argument, and rename the function as nvme_admin_cmd_sync() to make this new behavior clearer. Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Message-id: 20201029093306.1063879-17-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index eed12f4933..cd875555ca 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -481,16 +481,17 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, qemu_mutex_unlock(&q->lock); } -static void nvme_cmd_sync_cb(void *opaque, int ret) +static void nvme_admin_cmd_sync_cb(void *opaque, int ret) { int *pret = opaque; *pret = ret; aio_wait_kick(); } -static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, - NvmeCmd *cmd) +static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd) { + BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *q = s->queues[INDEX_ADMIN]; AioContext *aio_context = bdrv_get_aio_context(bs); NVMeRequest *req; int ret = -EINPROGRESS; @@ -498,7 +499,7 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, if (!req) { return -EBUSY; } - nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); + nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret); AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); return ret; @@ -535,7 +536,7 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) memset(id, 0, sizeof(*id)); cmd.dptr.prp1 = cpu_to_le64(iova); - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to identify controller"); goto out; } @@ -558,7 +559,7 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) memset(id, 0, sizeof(*id)); cmd.cdw10 = 0; cmd.nsid = cpu_to_le32(namespace); - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to identify namespace"); goto out; } @@ -664,7 +665,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), }; - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to create CQ io queue [%u]", n); goto out_error; } @@ -674,7 +675,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), }; - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to create SQ io queue [%u]", n); goto out_error; } @@ -887,7 +888,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), }; - ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd); + ret = nvme_admin_cmd_sync(bs, &cmd); if (ret) { error_setg(errp, "Failed to configure NVMe write cache"); } From c8228ac35541564c46bf1c91ebe7951f367e2ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:58 +0100 Subject: [PATCH 19/33] block/nvme: Set request_alignment at initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bdd6a90a9e5 ("block: Add VFIO based NVMe driver") sets the request_alignment in nvme_refresh_limits(). For consistency, also set it during initialization. Reported-by: Stefan Hajnoczi Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-18-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/nvme.c b/block/nvme.c index cd875555ca..bb75448a09 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -758,6 +758,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap)); s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); bs->bl.opt_mem_alignment = s->page_size; + bs->bl.request_alignment = s->page_size; timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); /* Reset device to get a clean state. */ From a652a3ec697d5b4cfce44599d3540aae535b9545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:32:59 +0100 Subject: [PATCH 20/33] block/nvme: Correct minimum device page size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While trying to simplify the code using a macro, we forgot the 12-bit shift... Correct that. Fixes: fad1eb68862 ("block/nvme: Use register definitions from 'block/nvme.h'") Reported-by: Eric Auger Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Auger Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-19-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/nvme.c b/block/nvme.c index bb75448a09..bd3860ac4e 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -755,7 +755,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, goto out; } - s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap)); + s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); bs->bl.opt_mem_alignment = s->page_size; bs->bl.request_alignment = s->page_size; From 0aecd06049ced6c0f863e99510b7a7cbff54aa56 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 29 Oct 2020 10:33:00 +0100 Subject: [PATCH 21/33] block/nvme: Change size and alignment of IDENTIFY response buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation of 64kB host page support, let's change the size and alignment of the IDENTIFY command response buffer so that the VFIO DMA MAP succeeds. We align on the host page size. Signed-off-by: Eric Auger Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-20-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index bd3860ac4e..7628623c05 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -522,19 +522,20 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) .opcode = NVME_ADM_CMD_IDENTIFY, .cdw10 = cpu_to_le32(0x1), }; + size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size); - id = qemu_try_memalign(s->page_size, sizeof(*id)); + id = qemu_try_memalign(qemu_real_host_page_size, id_size); if (!id) { error_setg(errp, "Cannot allocate buffer for identify response"); goto out; } - r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova); + r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova); if (r) { error_setg(errp, "Cannot map buffer for DMA"); goto out; } - memset(id, 0, sizeof(*id)); + memset(id, 0, id_size); cmd.dptr.prp1 = cpu_to_le64(iova); if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to identify controller"); @@ -556,7 +557,7 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); s->supports_discard = !!(oncs & NVME_ONCS_DSM); - memset(id, 0, sizeof(*id)); + memset(id, 0, id_size); cmd.cdw10 = 0; cmd.nsid = cpu_to_le32(namespace); if (nvme_admin_cmd_sync(bs, &cmd)) { From 2387aaced7209872238eaf594997009cffd5501d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 29 Oct 2020 10:33:01 +0100 Subject: [PATCH 22/33] block/nvme: Change size and alignment of queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation of 64kB host page support, let's change the size and alignment of the queue so that the VFIO DMA MAP succeeds. We align on the host page size. Signed-off-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-21-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 7628623c05..4a8589d2d2 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -167,9 +167,9 @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, size_t bytes; int r; - bytes = ROUND_UP(nentries * entry_bytes, s->page_size); + bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size); q->head = q->tail = 0; - q->queue = qemu_try_memalign(s->page_size, bytes); + q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes); if (!q->queue) { error_setg(errp, "Cannot allocate queue"); return false; From f8fd3ebac358c187d0aba7f922450ed6addf41a8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 29 Oct 2020 10:33:02 +0100 Subject: [PATCH 23/33] block/nvme: Change size and alignment of prp_list_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation of 64kB host page support, let's change the size and alignment of the prp_list_pages so that the VFIO DMA MAP succeeds with 64kB host page size. We align on the host page size. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-22-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 4a8589d2d2..e807dd56df 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -215,6 +215,7 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, int i, r; NVMeQueuePair *q; uint64_t prp_list_iova; + size_t bytes; q = g_try_new0(NVMeQueuePair, 1); if (!q) { @@ -222,19 +223,19 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, } trace_nvme_create_queue_pair(idx, q, size, aio_context, event_notifier_get_fd(s->irq_notifier)); - q->prp_list_pages = qemu_try_memalign(s->page_size, - s->page_size * NVME_NUM_REQS); + bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS, + qemu_real_host_page_size); + q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes); if (!q->prp_list_pages) { goto fail; } - memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS); + memset(q->prp_list_pages, 0, bytes); qemu_mutex_init(&q->lock); q->s = s; q->index = idx; qemu_co_queue_init(&q->free_req_queue); q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); - r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, - s->page_size * NVME_NUM_REQS, + r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes, false, &prp_list_iova); if (r) { goto fail; From 9e13d598843cca1cdab7b7bdcb9cc0868ebf7fed Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 29 Oct 2020 10:33:03 +0100 Subject: [PATCH 24/33] block/nvme: Align iov's va and size on host page size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure iov's va and size are properly aligned on the host page size. Signed-off-by: Eric Auger Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-23-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index e807dd56df..f1e2fd34cd 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1015,11 +1015,12 @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, for (i = 0; i < qiov->niov; ++i) { bool retry = true; uint64_t iova; + size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len, + qemu_real_host_page_size); try_map: r = qemu_vfio_dma_map(s->vfio, qiov->iov[i].iov_base, - qiov->iov[i].iov_len, - true, &iova); + len, true, &iova); if (r == -ENOMEM && retry) { retry = false; trace_nvme_dma_flush_queue_wait(s); @@ -1163,8 +1164,9 @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs, BDRVNVMeState *s = bs->opaque; for (i = 0; i < qiov->niov; ++i) { - if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) || - !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) { + if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, + qemu_real_host_page_size) || + !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) { trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, qiov->iov[i].iov_len, s->page_size); return false; @@ -1180,7 +1182,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, int r; uint8_t *buf = NULL; QEMUIOVector local_qiov; - + size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size); assert(QEMU_IS_ALIGNED(offset, s->page_size)); assert(QEMU_IS_ALIGNED(bytes, s->page_size)); assert(bytes <= s->max_transfer); @@ -1190,7 +1192,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } s->stats.unaligned_accesses++; trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); - buf = qemu_try_memalign(s->page_size, bytes); + buf = qemu_try_memalign(qemu_real_host_page_size, len); if (!buf) { return -ENOMEM; From 4b19e9b8159b5eecf8e90c81c56c1a13547e411a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:33:04 +0100 Subject: [PATCH 25/33] block/nvme: Fix use of write-only doorbells page on Aarch64 arch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu_vfio_pci_map_bar() calls mmap(), and mmap(2) states: 'offset' must be a multiple of the page size as returned by sysconf(_SC_PAGE_SIZE). In commit f68453237b9 we started to use an offset of 4K which broke this contract on Aarch64 arch. Fix by mapping at offset 0, and and accessing doorbells at offset=4K. Fixes: f68453237b9 ("block/nvme: Map doorbells pages write-only") Reported-by: Eric Auger Reviewed-by: Eric Auger Reviewed-by: Stefan Hajnoczi Tested-by: Eric Auger Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201029093306.1063879-24-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index f1e2fd34cd..c8ef69cbb2 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -94,6 +94,7 @@ typedef struct { struct BDRVNVMeState { AioContext *aio_context; QEMUVFIOState *vfio; + void *bar0_wo_map; /* Memory mapped registers */ volatile struct { uint32_t sq_tail; @@ -777,8 +778,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } } - s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar), - NVME_DOORBELL_SIZE, PROT_WRITE, errp); + s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, + sizeof(NvmeBar) + NVME_DOORBELL_SIZE, + PROT_WRITE, errp); + s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); if (!s->doorbells) { ret = -EINVAL; goto out; @@ -910,8 +913,8 @@ static void nvme_close(BlockDriverState *bs) &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); - qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells, - sizeof(NvmeBar), NVME_DOORBELL_SIZE); + qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map, + 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE); qemu_vfio_close(s->vfio); g_free(s->device); From a0546a7b6f14a919b67cea1d1fc4362ac2901868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 29 Oct 2020 10:33:05 +0100 Subject: [PATCH 26/33] block/nvme: Fix nvme_submit_command() on big-endian host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Completion Queue Command Identifier is a 16-bit value, so nvme_submit_command() is unlikely to work on big-endian hosts, as the relevant bits are truncated. Fix by using the correct byte-swap function. Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver") Reported-by: Keith Busch Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Message-id: 20201029093306.1063879-25-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/nvme.c b/block/nvme.c index c8ef69cbb2..a06a188d53 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -469,7 +469,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, assert(!req->cb); req->cb = cb; req->opaque = opaque; - cmd->cid = cpu_to_le32(req->cid); + cmd->cid = cpu_to_le16(req->cid); trace_nvme_submit_command(q->s, q->index, req->cid); nvme_trace_command(cmd); From a4bc212add9bd9d7a7baaa095f875d9212eb1fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:27 +0100 Subject: [PATCH 27/33] util/vfio-helpers: Improve reporting unsupported IOMMU type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the confuse "VFIO IOMMU check failed" error message by the explicit "VFIO IOMMU Type1 is not supported" once. Example on POWER: $ qemu-system-ppc64 -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw qemu-system-ppc64: -drive if=none,id=nvme0,file=nvme://0001:01:00.0/1,format=raw: VFIO IOMMU Type1 is not supported Suggested-by: Alex Williamson Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-2-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/vfio-helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index c469beb061..14a549510f 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -300,7 +300,7 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, } if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { - error_setg_errno(errp, errno, "VFIO IOMMU check failed"); + error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported"); ret = -EINVAL; goto fail_container; } From 3d87c2d9435a4c71d588a770a8a34382c1afa9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:28 +0100 Subject: [PATCH 28/33] util/vfio-helpers: Trace PCI I/O config accesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We sometime get kernel panic with some devices on Aarch64 hosts. Alex Williamson suggests it might be broken PCIe root complex. Add trace event to record the latest I/O access before crashing. In case, assert our accesses are aligned. Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-3-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/trace-events | 2 ++ util/vfio-helpers.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/util/trace-events b/util/trace-events index 24c31803b0..8d3615e717 100644 --- a/util/trace-events +++ b/util/trace-events @@ -85,3 +85,5 @@ qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64 qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p" qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" +qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 14a549510f..1d4efafcaa 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -227,6 +227,10 @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, { int ret; + trace_qemu_vfio_pci_read_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); do { ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); } while (ret == -1 && errno == EINTR); @@ -237,6 +241,10 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int { int ret; + trace_qemu_vfio_pci_write_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); do { ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); } while (ret == -1 && errno == EINTR); From df058222e77963e9bece723d19142354ed443d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:29 +0100 Subject: [PATCH 29/33] util/vfio-helpers: Trace PCI BAR region info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For debug purpose, trace BAR regions info. Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-4-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/trace-events | 1 + util/vfio-helpers.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/util/trace-events b/util/trace-events index 8d3615e717..0753e4a0ed 100644 --- a/util/trace-events +++ b/util/trace-events @@ -87,3 +87,4 @@ qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *io qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 1d4efafcaa..cd6287c3a9 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -136,6 +136,7 @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) { + g_autofree char *barname = NULL; assert_bar_index_valid(s, index); s->bar_region_info[index] = (struct vfio_region_info) { .index = VFIO_PCI_BAR0_REGION_INDEX + index, @@ -145,6 +146,10 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) error_setg_errno(errp, errno, "Failed to get BAR region info"); return -errno; } + barname = g_strdup_printf("bar[%d]", index); + trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset, + s->bar_region_info[index].size, + s->bar_region_info[index].cap_offset); return 0; } @@ -416,6 +421,9 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, ret = -errno; goto fail; } + trace_qemu_vfio_region_info("config", s->config_region_info.offset, + s->config_region_info.size, + s->config_region_info.cap_offset); for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) { ret = qemu_vfio_pci_init_bar(s, i, errp); From 2817fbcee060cfdf3899fbab97e185272f5f83ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:30 +0100 Subject: [PATCH 30/33] util/vfio-helpers: Trace where BARs are mapped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For debugging purpose, trace where a BAR is mapped. Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-5-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/trace-events | 1 + util/vfio-helpers.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/util/trace-events b/util/trace-events index 0753e4a0ed..52d43cda3f 100644 --- a/util/trace-events +++ b/util/trace-events @@ -88,3 +88,4 @@ qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 +qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index cd6287c3a9..278c54902e 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -166,6 +166,8 @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), prot, MAP_SHARED, s->device, s->bar_region_info[index].offset + offset); + trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset , + size, offset, p); if (p == MAP_FAILED) { error_setg_errno(errp, errno, "Failed to map BAR region"); p = NULL; From 4c946b2236e630613fcb89516bf7fb2f918dc996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:31 +0100 Subject: [PATCH 31/33] util/vfio-helpers: Improve DMA trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For debugging purpose, trace where DMA regions are mapped. Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-6-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/trace-events | 5 +++-- util/vfio-helpers.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/util/trace-events b/util/trace-events index 52d43cda3f..08239941cb 100644 --- a/util/trace-events +++ b/util/trace-events @@ -82,8 +82,9 @@ qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%z qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64 -qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64 -qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p" +qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx" +qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p" +qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx" qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 278c54902e..c24a510df8 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -627,7 +627,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, .vaddr = (uintptr_t)host, .size = size, }; - trace_qemu_vfio_do_mapping(s, host, size, iova); + trace_qemu_vfio_do_mapping(s, host, iova, size); if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); @@ -783,6 +783,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, } } } + trace_qemu_vfio_dma_mapped(s, host, iova0, size); if (iova) { *iova = iova0; } From f6b8104d0b75a8c2b21bd3175e9ba9a402182b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:32 +0100 Subject: [PATCH 32/33] util/vfio-helpers: Convert vfio_dump_mapping to trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The QEMU_VFIO_DEBUG definition is only modifiable at build-time. Trace events can be enabled at run-time. As we prefer the latter, convert qemu_vfio_dump_mappings() to use trace events instead of fprintf(). Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-7-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/trace-events | 1 + util/vfio-helpers.c | 19 ++++--------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/util/trace-events b/util/trace-events index 08239941cb..61e0d4bcdf 100644 --- a/util/trace-events +++ b/util/trace-events @@ -80,6 +80,7 @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex qemu_vfio_dma_reset_temporary(void *s) "s %p" qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx" qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx" qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64 qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index c24a510df8..73f7bfa754 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -521,23 +521,12 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) return s; } -static void qemu_vfio_dump_mapping(IOVAMapping *m) -{ - if (QEMU_VFIO_DEBUG) { - printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, - (uint64_t)m->size, (uint64_t)m->iova); - } -} - static void qemu_vfio_dump_mappings(QEMUVFIOState *s) { - int i; - - if (QEMU_VFIO_DEBUG) { - printf("vfio mappings\n"); - for (i = 0; i < s->nr_mappings; ++i) { - qemu_vfio_dump_mapping(&s->mappings[i]); - } + for (int i = 0; i < s->nr_mappings; ++i) { + trace_qemu_vfio_dump_mapping(s->mappings[i].host, + s->mappings[i].iova, + s->mappings[i].size); } } From fc107d86840b3364e922c26cf7631b7fd38ce523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 3 Nov 2020 03:07:33 +0100 Subject: [PATCH 33/33] util/vfio-helpers: Assert offset is aligned to page size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mmap(2) states: 'offset' must be a multiple of the page size as returned by sysconf(_SC_PAGE_SIZE). Add an assertion to be sure we don't break this contract. Signed-off-by: Philippe Mathieu-Daudé Message-id: 20201103020733.2303148-8-philmd@redhat.com Signed-off-by: Stefan Hajnoczi Tested-by: Eric Auger --- util/vfio-helpers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 73f7bfa754..804768d5c6 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -162,6 +162,7 @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, Error **errp) { void *p; + assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size)); assert_bar_index_valid(s, index); p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), prot, MAP_SHARED,