From 4bb571d857d973d9308d9fdb1f48d983d6639bd4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 15 Feb 2017 22:37:45 +0200 Subject: [PATCH 01/23] pci/pcie: don't assume cap id 0 is reserved VFIO actually wants to create a capability with ID == 0. This is done to make guest drivers skip the given capability. pcie_add_capability then trips up on this capability when looking for end of capability list. To support this use-case, it's easy enough to switch to e.g. 0xffffffff for these comparisons - we can be sure it will never match a 16-bit capability ID. Signed-off-by: Michael S. Tsirkin Reviewed-by: Peter Xu Reviewed-by: Alex Williamson --- hw/pci/pcie.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index cbd4bb4f8c..f4dd177bc4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -610,7 +610,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev) * uint16_t ext_cap_size */ -static uint16_t pcie_find_capability_list(PCIDevice *dev, uint16_t cap_id, +/* Passing a cap_id value > 0xffff will return 0 and put end of list in prev */ +static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id, uint16_t *prev_p) { uint16_t prev = 0; @@ -679,9 +680,11 @@ void pcie_add_capability(PCIDevice *dev, } else { uint16_t prev; - /* 0 is reserved cap id. use internally to find the last capability - in the linked list */ - next = pcie_find_capability_list(dev, 0, &prev); + /* + * 0xffffffff is not a valid cap id (it's a 16 bit field). use + * internally to find the last capability in the linked list. + */ + next = pcie_find_capability_list(dev, 0xffffffff, &prev); assert(prev >= PCI_CONFIG_SPACE_SIZE); assert(next == 0); From 0793169870f376bc9959b7d81df48ab4a90dcceb Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 9 Feb 2017 16:40:47 +0800 Subject: [PATCH 02/23] virtio: Report real progress in VQ aio poll handler In virtio_queue_host_notifier_aio_poll, not all "!virtio_queue_empty()" cases are making true progress. Currently the offending one is virtio-scsi event queue, whose handler does nothing if no event is pending. As a result aio_poll() will spin on the "non-empty" VQ and take 100% host CPU. Fix this by reporting actual progress from virtio queue aio handlers. Reported-by: Ed Swierk Signed-off-by: Fam Zheng Tested-by: Ed Swierk Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/block/dataplane/virtio-blk.c | 4 ++-- hw/block/virtio-blk.c | 12 ++++++++++-- hw/scsi/virtio-scsi-dataplane.c | 14 +++++++------- hw/scsi/virtio-scsi.c | 14 +++++++++++--- hw/virtio/virtio.c | 15 +++++++++------ include/hw/virtio/virtio-blk.h | 2 +- include/hw/virtio/virtio-scsi.h | 6 +++--- include/hw/virtio/virtio.h | 4 ++-- 8 files changed, 45 insertions(+), 26 deletions(-) diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index d1f9f63eaf..5556f0e64e 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -147,7 +147,7 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) g_free(s); } -static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev, +static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBlock *s = (VirtIOBlock *)vdev; @@ -155,7 +155,7 @@ static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev, assert(s->dataplane); assert(s->dataplane_started); - virtio_blk_handle_vq(s, vq); + return virtio_blk_handle_vq(s, vq); } /* Context: QEMU global mutex held */ diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 702eda863e..baaa19593f 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -581,10 +581,11 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) return 0; } -void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) +bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) { VirtIOBlockReq *req; MultiReqBuffer mrb = {}; + bool progress = false; blk_io_plug(s->blk); @@ -592,6 +593,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) virtio_queue_set_notification(vq, 0); while ((req = virtio_blk_get_request(s, vq))) { + progress = true; if (virtio_blk_handle_request(req, &mrb)) { virtqueue_detach_element(req->vq, &req->elem, 0); virtio_blk_free_request(req); @@ -607,6 +609,12 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) } blk_io_unplug(s->blk); + return progress; +} + +static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq) +{ + virtio_blk_handle_vq(s, vq); } static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) @@ -622,7 +630,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) return; } } - virtio_blk_handle_vq(s, vq); + virtio_blk_handle_output_do(s, vq); } static void virtio_blk_dma_restart_bh(void *opaque) diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c index 6b8d0f0024..74c95e0e60 100644 --- a/hw/scsi/virtio-scsi-dataplane.c +++ b/hw/scsi/virtio-scsi-dataplane.c @@ -49,35 +49,35 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) } } -static void virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = (VirtIOSCSI *)vdev; assert(s->ctx && s->dataplane_started); - virtio_scsi_handle_cmd_vq(s, vq); + return virtio_scsi_handle_cmd_vq(s, vq); } -static void virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = VIRTIO_SCSI(vdev); assert(s->ctx && s->dataplane_started); - virtio_scsi_handle_ctrl_vq(s, vq); + return virtio_scsi_handle_ctrl_vq(s, vq); } -static void virtio_scsi_data_plane_handle_event(VirtIODevice *vdev, +static bool virtio_scsi_data_plane_handle_event(VirtIODevice *vdev, VirtQueue *vq) { VirtIOSCSI *s = VIRTIO_SCSI(vdev); assert(s->ctx && 
s->dataplane_started); - virtio_scsi_handle_event_vq(s, vq); + return virtio_scsi_handle_event_vq(s, vq); } static int virtio_scsi_vring_init(VirtIOSCSI *s, VirtQueue *vq, int n, - void (*fn)(VirtIODevice *vdev, VirtQueue *vq)) + VirtIOHandleAIOOutput fn) { BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s))); int rc; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index ce19efffc8..b01030b745 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -436,13 +436,16 @@ static inline void virtio_scsi_release(VirtIOSCSI *s) } } -void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req; + bool progress = false; while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; virtio_scsi_handle_ctrl_req(s, req); } + return progress; } static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) @@ -591,10 +594,11 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req) scsi_req_unref(sreq); } -void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) { VirtIOSCSIReq *req, *next; int ret = 0; + bool progress = false; QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); @@ -602,6 +606,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) virtio_queue_set_notification(vq, 0); while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; ret = virtio_scsi_handle_cmd_req_prepare(s, req); if (!ret) { QTAILQ_INSERT_TAIL(&reqs, req, next); @@ -624,6 +629,7 @@ void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { virtio_scsi_handle_cmd_req_submit(s, req); } + return progress; } static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq) @@ -752,11 +758,13 @@ out: virtio_scsi_release(s); } -void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) +bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) { if (s->events_dropped) { virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); + return true; } + return false; } static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 63657066e7..2461c06199 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -97,7 +97,7 @@ struct VirtQueue uint16_t vector; VirtIOHandleOutput handle_output; - VirtIOHandleOutput handle_aio_output; + VirtIOHandleAIOOutput handle_aio_output; VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; @@ -1287,14 +1287,16 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) virtio_queue_update_rings(vdev, n); } -static void virtio_queue_notify_aio_vq(VirtQueue *vq) +static bool virtio_queue_notify_aio_vq(VirtQueue *vq) { if (vq->vring.desc && vq->handle_aio_output) { VirtIODevice *vdev = vq->vdev; trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); - vq->handle_aio_output(vdev, vq); + return vq->handle_aio_output(vdev, vq); } + + return false; } static void virtio_queue_notify_vq(VirtQueue *vq) @@ -2125,16 +2127,17 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque) { EventNotifier *n = opaque; VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + bool progress; if (virtio_queue_empty(vq)) { return false; } - virtio_queue_notify_aio_vq(vq); + progress = virtio_queue_notify_aio_vq(vq); /* In case the handler function re-enabled notifications */ virtio_queue_set_notification(vq, 0); - return 
true; + return progress; } static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) @@ -2146,7 +2149,7 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) } void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx, - VirtIOHandleOutput handle_output) + VirtIOHandleAIOOutput handle_output) { if (handle_output) { vq->handle_aio_output = handle_output; diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 9734b4c446..d3c8a6fa8c 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -80,6 +80,6 @@ typedef struct MultiReqBuffer { bool is_write; } MultiReqBuffer; -void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); +bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); #endif diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h index 73751969ba..f536f77e68 100644 --- a/include/hw/virtio/virtio-scsi.h +++ b/include/hw/virtio/virtio-scsi.h @@ -126,9 +126,9 @@ void virtio_scsi_common_realize(DeviceState *dev, Error **errp, VirtIOHandleOutput cmd); void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp); -void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq); -void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq); -void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq); +bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq); +bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq); +bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq); void virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq, VirtIOSCSIReq *req); void virtio_scsi_free_req(VirtIOSCSIReq *req); void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev, diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 525da24222..0863a254ba 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -154,6 +154,7 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3); void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name); typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *); +typedef bool (*VirtIOHandleAIOOutput)(VirtIODevice *, VirtQueue *); VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, VirtIOHandleOutput handle_output); @@ -284,8 +285,7 @@ bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev); EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq); void virtio_queue_host_notifier_read(EventNotifier *n); void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx, - void (*fn)(VirtIODevice *, - VirtQueue *)); + VirtIOHandleAIOOutput handle_output); VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector); VirtQueue *virtio_vector_next_queue(VirtQueue *vq); From 79c0f397feaea37992500eafd933e301c2306410 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Fri, 3 Feb 2017 08:32:12 +0800 Subject: [PATCH 03/23] docs: add document to explain the usage of vNVDIMM Signed-off-by: Haozhong Zhang Reviewed-by: Xiao Guangrong Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- docs/nvdimm.txt | 124 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 docs/nvdimm.txt diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt new file mode 100644 index 0000000000..2d9f8c0e8c --- /dev/null +++ b/docs/nvdimm.txt @@ -0,0 +1,124 @@ +QEMU Virtual NVDIMM +=================== + +This document explains the usage of the virtual NVDIMM (vNVDIMM) feature +which has been available since QEMU v2.6.0. + +The current QEMU only implements the persistent memory mode of the vNVDIMM +device and not the block window mode. + +Basic Usage +----------- + +The storage of a vNVDIMM device in QEMU is provided by the memory +backend (i.e. memory-backend-file and memory-backend-ram). A simple +way to create a vNVDIMM device at startup time is via the +following command line options: + + -machine pc,nvdimm + -m $RAM_SIZE,slots=$N,maxmem=$MAX_SIZE + -object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE + -device nvdimm,id=nvdimm1,memdev=mem1 + +Where, + + - the "nvdimm" machine option enables the vNVDIMM feature. + + - "slots=$N" should be equal to or larger than the total amount of + normal RAM devices and vNVDIMM devices, e.g. $N should be >= 2 here. + + - "maxmem=$MAX_SIZE" should be equal to or larger than the total size + of normal RAM devices and vNVDIMM devices, e.g. $MAX_SIZE should be + >= $RAM_SIZE + $NVDIMM_SIZE here. + + - "object memory-backend-file,id=mem1,share=on,mem-path=$PATH,size=$NVDIMM_SIZE" + creates a backend storage of size $NVDIMM_SIZE on a file $PATH. All + accesses to the virtual NVDIMM device go to the file $PATH. + + "share=on/off" controls the visibility of guest writes. If + "share=on", then guest writes will be applied to the backend + file. If another guest uses the same backend file with option + "share=on", then the above writes will be visible to it as well. If + "share=off", then guest writes won't be applied to the backend + file and thus will be invisible to other guests. + + - "device nvdimm,id=nvdimm1,memdev=mem1" creates a virtual NVDIMM + device whose storage is provided by the above memory backend device. + +Multiple vNVDIMM devices can be created if multiple pairs of "-object" +and "-device" are provided. + +For the above command line options, if the guest OS has the proper NVDIMM +driver, it should be able to detect an NVDIMM device which is in the +persistent memory mode and whose size is $NVDIMM_SIZE. + +Note: + +1. Prior to QEMU v2.8.0, if memory-backend-file is used and the actual + backend file size is not equal to the size given by the "size" option, + QEMU will truncate the backend file by ftruncate(2), which will + corrupt the existing data in the backend file, especially for the + shrink case. + + QEMU v2.8.0 and later check the backend file size and the "size" + option. If they do not match, QEMU will report errors and abort in + order to avoid data corruption. + +2. QEMU v2.6.0 only puts a basic alignment requirement on the "size" + option of memory-backend-file, e.g. 4KB alignment on x86. However, + QEMU v2.7.0 puts an additional alignment requirement, which may + require a larger value than the basic one, e.g. 2MB on x86. This + change breaks the usage of memory-backend-file that only satisfies + the basic alignment. + + QEMU v2.8.0 and later remove the additional alignment on non-s390x + architectures, so the broken memory-backend-file can work again. + +Label +----- + +QEMU v2.7.0 and later implement label support for vNVDIMM devices.
+To enable labels on vNVDIMM devices, users can simply add +the "label-size=$SZ" option to "-device nvdimm", e.g. + + -device nvdimm,id=nvdimm1,memdev=mem1,label-size=128K + +Note: + +1. The minimum label size is 128KB. + +2. QEMU v2.7.0 and later store labels at the end of backend storage. + If a memory backend file, which was previously used as the backend + of a vNVDIMM device without labels, is now used for a vNVDIMM + device with labels, the data in the label area at the end of the file + will be inaccessible to the guest. If any useful data (e.g. the + meta-data of the file system) was stored there, the latter usage + may result in guest data corruption (e.g. breakage of the guest file + system). + +Hotplug +------- + +QEMU v2.8.0 and later implement hotplug support for vNVDIMM +devices. Similarly to the RAM hotplug, the vNVDIMM hotplug is +accomplished by the two monitor commands "object_add" and "device_add". + +For example, the following commands add another 4GB vNVDIMM device to +the guest: + + (qemu) object_add memory-backend-file,id=mem2,share=on,mem-path=new_nvdimm.img,size=4G + (qemu) device_add nvdimm,id=nvdimm2,memdev=mem2 + +Note: + +1. Each hotplugged vNVDIMM device consumes one memory slot. Users + should always ensure the memory option "-m ...,slots=N" specifies + a sufficient number of slots, i.e. + N >= number of RAM devices + + number of statically plugged vNVDIMM devices + + number of hotplugged vNVDIMM devices + +2. A similar requirement applies to the memory option "-m ...,maxmem=M", i.e. + M >= size of RAM devices + + size of statically plugged vNVDIMM devices + + size of hotplugged vNVDIMM devices From 1d8280c18f96f0cd96d1e7acd62f7250c4da1a84 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:12 +0100 Subject: [PATCH 04/23] memory: make memory_listener_unregister idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it easy to unregister a MemoryListener without tracking whether it had been registered before. Signed-off-by: Paolo Bonzini Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/memory.c b/memory.c index 6c58373422..ed8b5aa83e 100644 --- a/memory.c +++ b/memory.c @@ -2371,8 +2371,13 @@ void memory_listener_register(MemoryListener *listener, AddressSpace *as) void memory_listener_unregister(MemoryListener *listener) { + if (!listener->address_space) { + return; + } + QTAILQ_REMOVE(&memory_listeners, listener, link); QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as); + listener->address_space = NULL; } void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) From e6a830d6eba6ed116a0b4ef5577ca0b46edf4c67 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:13 +0100 Subject: [PATCH 05/23] virtio: add virtio_*_phys_cached Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- include/hw/virtio/virtio-access.h | 52 +++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h index 91ae14d254..2e92074bd1 100644 --- a/include/hw/virtio/virtio-access.h +++ b/include/hw/virtio/virtio-access.h @@ -156,6 +156,58 @@ static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s) #endif } +static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev, + MemoryRegionCache *cache, + hwaddr pa) +{ + if (virtio_access_is_big_endian(vdev)) { + return lduw_be_phys_cached(cache, pa); + } + return lduw_le_phys_cached(cache, pa); +} + +static inline uint32_t virtio_ldl_phys_cached(VirtIODevice *vdev, + MemoryRegionCache *cache, + hwaddr pa) +{ + if (virtio_access_is_big_endian(vdev)) { + return ldl_be_phys_cached(cache, pa); + } + return ldl_le_phys_cached(cache, pa); +} + +static inline uint64_t virtio_ldq_phys_cached(VirtIODevice *vdev, + MemoryRegionCache *cache, + hwaddr pa) +{ + if (virtio_access_is_big_endian(vdev)) { + return ldq_be_phys_cached(cache, pa); + } + return ldq_le_phys_cached(cache, pa); +} + +static inline void virtio_stw_phys_cached(VirtIODevice *vdev, + MemoryRegionCache *cache, + hwaddr pa, uint16_t value) +{ + if (virtio_access_is_big_endian(vdev)) { + stw_be_phys_cached(cache, pa, value); + } else { + stw_le_phys_cached(cache, pa, value); + } +} + +static inline void virtio_stl_phys_cached(VirtIODevice *vdev, + MemoryRegionCache *cache, + hwaddr pa, uint32_t value) +{ + if (virtio_access_is_big_endian(vdev)) { + stl_be_phys_cached(cache, pa, value); + } else { + stl_le_phys_cached(cache, pa, value); + } +} + static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s) { *s = virtio_tswap16(vdev, *s); From 9796d0ac8fb0a8d522afd409cb25a607eae61617 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:14 +0100 Subject: [PATCH 06/23] virtio: use address_space_map/unmap to access descriptors This makes little difference, but it makes the code change smaller for the next patch that introduces MemoryRegionCache. This is because map/unmap are similar to MemoryRegionCache init/destroy. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 103 +++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 2461c06199..6ce6a26908 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -120,10 +120,9 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) } static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, - hwaddr desc_pa, int i) + uint8_t *desc_ptr, int i) { - address_space_read(vdev->dma_as, desc_pa + i * sizeof(VRingDesc), - MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc)); + memcpy(desc, desc_ptr + i * sizeof(VRingDesc), sizeof(VRingDesc)); virtio_tswap64s(vdev, &desc->addr); virtio_tswap32s(vdev, &desc->len); virtio_tswap16s(vdev, &desc->flags); @@ -408,7 +407,7 @@ enum { }; static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, - hwaddr desc_pa, unsigned int max, + void *desc_ptr, unsigned int max, unsigned int *next) { /* If this descriptor says it doesn't chain, we're done. 
*/ @@ -426,7 +425,7 @@ static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, return VIRTQUEUE_READ_DESC_ERROR; } - vring_desc_read(vdev, desc, desc_pa, *next); + vring_desc_read(vdev, desc, desc_ptr, *next); return VIRTQUEUE_READ_DESC_MORE; } @@ -434,31 +433,41 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, unsigned int *out_bytes, unsigned max_in_bytes, unsigned max_out_bytes) { - unsigned int idx; + VirtIODevice *vdev = vq->vdev; + unsigned int max, idx; unsigned int total_bufs, in_total, out_total; + void *vring_desc_ptr; + void *indirect_desc_ptr = NULL; + hwaddr len = 0; int rc; idx = vq->last_avail_idx; - total_bufs = in_total = out_total = 0; + + max = vq->vring.num; + len = max * sizeof(VRingDesc); + vring_desc_ptr = address_space_map(vdev->dma_as, vq->vring.desc, &len, false); + if (len < max * sizeof(VRingDesc)) { + virtio_error(vdev, "Cannot map descriptor ring"); + goto err; + } + while ((rc = virtqueue_num_heads(vq, idx)) > 0) { - VirtIODevice *vdev = vq->vdev; - unsigned int max, num_bufs, indirect = 0; + void *desc_ptr = vring_desc_ptr; + unsigned int num_bufs; VRingDesc desc; - hwaddr desc_pa; unsigned int i; - max = vq->vring.num; num_bufs = total_bufs; if (!virtqueue_get_head(vq, idx++, &i)) { goto err; } - desc_pa = vq->vring.desc; - vring_desc_read(vdev, &desc, desc_pa, i); + vring_desc_read(vdev, &desc, desc_ptr, i); if (desc.flags & VRING_DESC_F_INDIRECT) { + len = desc.len; if (desc.len % sizeof(VRingDesc)) { virtio_error(vdev, "Invalid size for indirect buffer table"); goto err; @@ -471,11 +480,17 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } /* loop over the indirect descriptor table */ - indirect = 1; + indirect_desc_ptr = address_space_map(vdev->dma_as, desc.addr, + &len, false); + desc_ptr = indirect_desc_ptr; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto err; + } + max = desc.len / sizeof(VRingDesc); - desc_pa = desc.addr; num_bufs = i = 0; - vring_desc_read(vdev, &desc, desc_pa, i); + vring_desc_read(vdev, &desc, desc_ptr, i); } do { @@ -494,17 +509,20 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, goto done; } - rc = virtqueue_read_next_desc(vdev, &desc, desc_pa, max, &i); + rc = virtqueue_read_next_desc(vdev, &desc, desc_ptr, max, &i); } while (rc == VIRTQUEUE_READ_DESC_MORE); if (rc == VIRTQUEUE_READ_DESC_ERROR) { goto err; } - if (!indirect) - total_bufs = num_bufs; - else + if (desc_ptr == indirect_desc_ptr) { + address_space_unmap(vdev->dma_as, desc_ptr, len, false, 0); + indirect_desc_ptr = NULL; total_bufs++; + } else { + total_bufs = num_bufs; + } } if (rc < 0) { @@ -512,6 +530,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } done: + if (indirect_desc_ptr) { + address_space_unmap(vdev->dma_as, indirect_desc_ptr, len, false, 0); + } + address_space_unmap(vdev->dma_as, vring_desc_ptr, len, false, 0); if (in_bytes) { *in_bytes = in_total; } @@ -651,9 +673,12 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; - hwaddr desc_pa = vq->vring.desc; + void *vring_desc_ptr; + void *indirect_desc_ptr = NULL; + void *desc_ptr; + hwaddr len; VirtIODevice *vdev = vq->vdev; - VirtQueueElement *elem; + VirtQueueElement *elem = NULL; unsigned out_num, in_num; hwaddr addr[VIRTQUEUE_MAX_SIZE]; struct iovec iov[VIRTQUEUE_MAX_SIZE]; @@ -689,18 +714,34 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) } i 
= head; - vring_desc_read(vdev, &desc, desc_pa, i); + + len = max * sizeof(VRingDesc); + vring_desc_ptr = address_space_map(vdev->dma_as, vq->vring.desc, &len, false); + if (len < max * sizeof(VRingDesc)) { + virtio_error(vdev, "Cannot map descriptor ring"); + goto done; + } + + desc_ptr = vring_desc_ptr; + vring_desc_read(vdev, &desc, desc_ptr, i); if (desc.flags & VRING_DESC_F_INDIRECT) { if (desc.len % sizeof(VRingDesc)) { virtio_error(vdev, "Invalid size for indirect buffer table"); - return NULL; + goto done; } /* loop over the indirect descriptor table */ + len = desc.len; + indirect_desc_ptr = address_space_map(vdev->dma_as, desc.addr, &len, false); + desc_ptr = indirect_desc_ptr; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto done; + } + max = desc.len / sizeof(VRingDesc); - desc_pa = desc.addr; i = 0; - vring_desc_read(vdev, &desc, desc_pa, i); + vring_desc_read(vdev, &desc, desc_ptr, i); } /* Collect all the descriptors */ @@ -731,7 +772,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) goto err_undo_map; } - rc = virtqueue_read_next_desc(vdev, &desc, desc_pa, max, &i); + rc = virtqueue_read_next_desc(vdev, &desc, desc_ptr, max, &i); } while (rc == VIRTQUEUE_READ_DESC_MORE); if (rc == VIRTQUEUE_READ_DESC_ERROR) { @@ -753,11 +794,17 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); +done: + if (indirect_desc_ptr) { + address_space_unmap(vdev->dma_as, indirect_desc_ptr, len, false, 0); + } + address_space_unmap(vdev->dma_as, vring_desc_ptr, len, false, 0); + return elem; err_undo_map: virtqueue_undo_map_desc(out_num, in_num, iov); - return NULL; + goto done; } /* virtqueue_drop_all: From 91047df38dffa80222179f63fbb74c1dfefa25ed Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:15 +0100 Subject: [PATCH 07/23] exec: make address_space_cache_destroy idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clear cache->mr so that address_space_cache_destroy does nothing the second time it is called. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/exec.c b/exec.c index 6fa337b8d8..865a1e8295 100644 --- a/exec.c +++ b/exec.c @@ -3166,6 +3166,7 @@ void address_space_cache_destroy(MemoryRegionCache *cache) xen_invalidate_map_cache_entry(cache->ptr); } memory_region_unref(cache->mr); + cache->mr = NULL; } /* Called from RCU critical section. This function has the same From 5eba0404b98294906134c06519c272bfb5f50453 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:16 +0100 Subject: [PATCH 08/23] virtio: use MemoryRegionCache to access descriptors For now, the cache is created on every virtqueue_pop. Later on, direct descriptors will be able to reuse it. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/virtio/virtio.c | 80 +++++++++++++++++++++---------------------- include/exec/memory.h | 2 ++ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 6ce6a26908..71e41f66b1 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -120,9 +120,10 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) } static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, - uint8_t *desc_ptr, int i) + MemoryRegionCache *cache, int i) { - memcpy(desc, desc_ptr + i * sizeof(VRingDesc), sizeof(VRingDesc)); + address_space_read_cached(cache, i * sizeof(VRingDesc), + desc, sizeof(VRingDesc)); virtio_tswap64s(vdev, &desc->addr); virtio_tswap32s(vdev, &desc->len); virtio_tswap16s(vdev, &desc->flags); @@ -407,7 +408,7 @@ enum { }; static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, - void *desc_ptr, unsigned int max, + MemoryRegionCache *desc_cache, unsigned int max, unsigned int *next) { /* If this descriptor says it doesn't chain, we're done. */ @@ -425,7 +426,7 @@ static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, return VIRTQUEUE_READ_DESC_ERROR; } - vring_desc_read(vdev, desc, desc_ptr, *next); + vring_desc_read(vdev, desc, desc_cache, *next); return VIRTQUEUE_READ_DESC_MORE; } @@ -436,24 +437,25 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, VirtIODevice *vdev = vq->vdev; unsigned int max, idx; unsigned int total_bufs, in_total, out_total; - void *vring_desc_ptr; - void *indirect_desc_ptr = NULL; - hwaddr len = 0; + MemoryRegionCache vring_desc_cache; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + int64_t len = 0; int rc; idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; max = vq->vring.num; - len = max * sizeof(VRingDesc); - vring_desc_ptr = address_space_map(vdev->dma_as, vq->vring.desc, &len, false); + len = address_space_cache_init(&vring_desc_cache, vdev->dma_as, + vq->vring.desc, max * sizeof(VRingDesc), + false); if (len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto err; } while ((rc = virtqueue_num_heads(vq, idx)) > 0) { - void *desc_ptr = vring_desc_ptr; + MemoryRegionCache *desc_cache = &vring_desc_cache; unsigned int num_bufs; VRingDesc desc; unsigned int i; @@ -464,10 +466,9 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, goto err; } - vring_desc_read(vdev, &desc, desc_ptr, i); + vring_desc_read(vdev, &desc, desc_cache, i); if (desc.flags & VRING_DESC_F_INDIRECT) { - len = desc.len; if (desc.len % sizeof(VRingDesc)) { virtio_error(vdev, "Invalid size for indirect buffer table"); goto err; @@ -480,9 +481,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } /* loop over the indirect descriptor table */ - indirect_desc_ptr = address_space_map(vdev->dma_as, desc.addr, - &len, false); - desc_ptr = indirect_desc_ptr; + len = address_space_cache_init(&indirect_desc_cache, + vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; if (len < desc.len) { virtio_error(vdev, "Cannot map indirect buffer"); goto err; @@ -490,7 +492,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, max = desc.len / sizeof(VRingDesc); num_bufs = i = 0; - vring_desc_read(vdev, &desc, desc_ptr, i); + vring_desc_read(vdev, &desc, desc_cache, i); } do { @@ -509,16 +511,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, goto done; } - rc = virtqueue_read_next_desc(vdev, &desc, 
desc_ptr, max, &i); + rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i); } while (rc == VIRTQUEUE_READ_DESC_MORE); if (rc == VIRTQUEUE_READ_DESC_ERROR) { goto err; } - if (desc_ptr == indirect_desc_ptr) { - address_space_unmap(vdev->dma_as, desc_ptr, len, false, 0); - indirect_desc_ptr = NULL; + if (desc_cache == &indirect_desc_cache) { + address_space_cache_destroy(&indirect_desc_cache); total_bufs++; } else { total_bufs = num_bufs; @@ -530,10 +531,8 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, } done: - if (indirect_desc_ptr) { - address_space_unmap(vdev->dma_as, indirect_desc_ptr, len, false, 0); - } - address_space_unmap(vdev->dma_as, vring_desc_ptr, len, false, 0); + address_space_cache_destroy(&indirect_desc_cache); + address_space_cache_destroy(&vring_desc_cache); if (in_bytes) { *in_bytes = in_total; } @@ -673,10 +672,10 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; - void *vring_desc_ptr; - void *indirect_desc_ptr = NULL; - void *desc_ptr; - hwaddr len; + MemoryRegionCache vring_desc_cache; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + MemoryRegionCache *desc_cache; + int64_t len; VirtIODevice *vdev = vq->vdev; VirtQueueElement *elem = NULL; unsigned out_num, in_num; @@ -715,15 +714,16 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) i = head; - len = max * sizeof(VRingDesc); - vring_desc_ptr = address_space_map(vdev->dma_as, vq->vring.desc, &len, false); + len = address_space_cache_init(&vring_desc_cache, vdev->dma_as, + vq->vring.desc, max * sizeof(VRingDesc), + false); if (len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto done; } - desc_ptr = vring_desc_ptr; - vring_desc_read(vdev, &desc, desc_ptr, i); + desc_cache = &vring_desc_cache; + vring_desc_read(vdev, &desc, desc_cache, i); if (desc.flags & VRING_DESC_F_INDIRECT) { if (desc.len % sizeof(VRingDesc)) { virtio_error(vdev, "Invalid size for indirect buffer table"); @@ -731,9 +731,9 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) } /* loop over the indirect descriptor table */ - len = desc.len; - indirect_desc_ptr = address_space_map(vdev->dma_as, desc.addr, &len, false); - desc_ptr = indirect_desc_ptr; + len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; if (len < desc.len) { virtio_error(vdev, "Cannot map indirect buffer"); goto done; @@ -741,7 +741,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) max = desc.len / sizeof(VRingDesc); i = 0; - vring_desc_read(vdev, &desc, desc_ptr, i); + vring_desc_read(vdev, &desc, desc_cache, i); } /* Collect all the descriptors */ @@ -772,7 +772,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) goto err_undo_map; } - rc = virtqueue_read_next_desc(vdev, &desc, desc_ptr, max, &i); + rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i); } while (rc == VIRTQUEUE_READ_DESC_MORE); if (rc == VIRTQUEUE_READ_DESC_ERROR) { @@ -795,10 +795,8 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); done: - if (indirect_desc_ptr) { - address_space_unmap(vdev->dma_as, indirect_desc_ptr, len, false, 0); - } - address_space_unmap(vdev->dma_as, vring_desc_ptr, len, false, 0); + address_space_cache_destroy(&indirect_desc_cache); + address_space_cache_destroy(&vring_desc_cache); return elem; diff --git a/include/exec/memory.h 
b/include/exec/memory.h index 987f9251c6..691102317c 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1426,6 +1426,8 @@ struct MemoryRegionCache { bool is_write; }; +#define MEMORY_REGION_CACHE_INVALID ((MemoryRegionCache) { .mr = NULL }) + /* address_space_cache_init: prepare for repeated access to a physical * memory region * From c611c76417f52b335ecaab01c61743e3b705eb7c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:17 +0100 Subject: [PATCH 09/23] virtio: add MemoryListener to cache ring translations The cached translations are RCU-protected to allow efficient use when processing virtqueues. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 105 +++++++++++++++++++++++++++++++++++-- include/hw/virtio/virtio.h | 1 + 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 71e41f66b1..b75cb52fcc 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -60,6 +60,13 @@ typedef struct VRingUsed VRingUsedElem ring[0]; } VRingUsed; +typedef struct VRingMemoryRegionCaches { + struct rcu_head rcu; + MemoryRegionCache desc; + MemoryRegionCache avail; + MemoryRegionCache used; +} VRingMemoryRegionCaches; + typedef struct VRing { unsigned int num; @@ -68,6 +75,7 @@ typedef struct VRing hwaddr desc; hwaddr avail; hwaddr used; + VRingMemoryRegionCaches *caches; } VRing; struct VirtQueue @@ -104,6 +112,51 @@ struct VirtQueue QLIST_ENTRY(VirtQueue) node; }; +static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) +{ + if (!caches) { + return; + } + + address_space_cache_destroy(&caches->desc); + address_space_cache_destroy(&caches->avail); + address_space_cache_destroy(&caches->used); + g_free(caches); +} + +static void virtio_init_region_cache(VirtIODevice *vdev, int n) +{ + VirtQueue *vq = &vdev->vq[n]; + VRingMemoryRegionCaches *old = vq->vring.caches; + VRingMemoryRegionCaches *new; + hwaddr addr, size; + int event_size; + + event_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; + + addr = vq->vring.desc; + if (!addr) { + return; + } + new = g_new0(VRingMemoryRegionCaches, 1); + size = virtio_queue_get_desc_size(vdev, n); + address_space_cache_init(&new->desc, vdev->dma_as, + addr, size, false); + + size = virtio_queue_get_used_size(vdev, n) + event_size; + address_space_cache_init(&new->used, vdev->dma_as, + vq->vring.used, size, true); + + size = virtio_queue_get_avail_size(vdev, n) + event_size; + address_space_cache_init(&new->avail, vdev->dma_as, + vq->vring.avail, size, false); + + atomic_rcu_set(&vq->vring.caches, new); + if (old) { + call_rcu(old, virtio_free_region_cache, rcu); + } +} + /* virt queue functions */ void virtio_queue_update_rings(VirtIODevice *vdev, int n) { @@ -117,6 +170,7 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) vring->used = vring_align(vring->avail + offsetof(VRingAvail, ring[vring->num]), vring->align); + virtio_init_region_cache(vdev, n); } static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, @@ -1264,6 +1318,7 @@ void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, vdev->vq[n].vring.desc = desc; vdev->vq[n].vring.avail = avail; vdev->vq[n].vring.used = used; + virtio_init_region_cache(vdev, n); } void virtio_queue_set_num(VirtIODevice *vdev, int n, int num) @@ -1984,9 +2039,6 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) void virtio_cleanup(VirtIODevice *vdev) { qemu_del_vm_change_state_handler(vdev->vmstate); - g_free(vdev->config); - g_free(vdev->vq); - g_free(vdev->vector_queues); } static void virtio_vmstate_change(void *opaque, int running, RunState state) @@ -2248,6 +2300,19 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...) } } +static void virtio_memory_listener_commit(MemoryListener *listener) +{ + VirtIODevice *vdev = container_of(listener, VirtIODevice, listener); + int i; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) { + break; + } + virtio_init_region_cache(vdev, i); + } +} + static void virtio_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); @@ -2270,6 +2335,9 @@ static void virtio_device_realize(DeviceState *dev, Error **errp) error_propagate(errp, err); return; } + + vdev->listener.commit = virtio_memory_listener_commit; + memory_listener_register(&vdev->listener, vdev->dma_as); } static void virtio_device_unrealize(DeviceState *dev, Error **errp) @@ -2292,6 +2360,36 @@ static void virtio_device_unrealize(DeviceState *dev, Error **errp) vdev->bus_name = NULL; } +static void virtio_device_free_virtqueues(VirtIODevice *vdev) +{ + int i; + if (!vdev->vq) { + return; + } + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + VRingMemoryRegionCaches *caches; + if (vdev->vq[i].vring.num == 0) { + break; + } + caches = atomic_read(&vdev->vq[i].vring.caches); + atomic_set(&vdev->vq[i].vring.caches, NULL); + virtio_free_region_cache(caches); + } + g_free(vdev->vq); +} + +static void virtio_device_instance_finalize(Object *obj) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(obj); + + memory_listener_unregister(&vdev->listener); + virtio_device_free_virtqueues(vdev); + + g_free(vdev->config); + g_free(vdev->vector_queues); +} + static Property virtio_properties[] = { DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features), DEFINE_PROP_END_OF_LIST(), @@ -2418,6 +2516,7 @@ static const TypeInfo virtio_device_info = { .parent = TYPE_DEVICE, .instance_size = sizeof(VirtIODevice), .class_init = virtio_device_class_init, + .instance_finalize = 
virtio_device_instance_finalize, .abstract = true, .class_size = sizeof(VirtioDeviceClass), }; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 0863a254ba..15efcf2057 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -85,6 +85,7 @@ struct VirtIODevice uint32_t generation; int nvectors; VirtQueue *vq; + MemoryListener listener; uint16_t device_id; bool vm_running; bool broken; /* device in invalid state, needs reset */ From 991976f751c98c79bd174f0cd48250ac2120d04c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:18 +0100 Subject: [PATCH 10/23] virtio: use VRingMemoryRegionCaches for descriptor ring Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index b75cb52fcc..d62509d6cf 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -491,25 +491,24 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, VirtIODevice *vdev = vq->vdev; unsigned int max, idx; unsigned int total_bufs, in_total, out_total; - MemoryRegionCache vring_desc_cache; + VRingMemoryRegionCaches *caches; MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; int64_t len = 0; int rc; + rcu_read_lock(); idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; max = vq->vring.num; - len = address_space_cache_init(&vring_desc_cache, vdev->dma_as, - vq->vring.desc, max * sizeof(VRingDesc), - false); - if (len < max * sizeof(VRingDesc)) { + caches = atomic_rcu_read(&vq->vring.caches); + if (caches->desc.len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto err; } while ((rc = virtqueue_num_heads(vq, idx)) > 0) { - MemoryRegionCache *desc_cache = &vring_desc_cache; + MemoryRegionCache *desc_cache = &caches->desc; unsigned int num_bufs; VRingDesc desc; unsigned int i; @@ -586,13 +585,13 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, done: address_space_cache_destroy(&indirect_desc_cache); - address_space_cache_destroy(&vring_desc_cache); if (in_bytes) { *in_bytes = in_total; } if (out_bytes) { *out_bytes = out_total; } + rcu_read_unlock(); return; err: @@ -726,7 +725,7 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; - MemoryRegionCache vring_desc_cache; + VRingMemoryRegionCaches *caches; MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; MemoryRegionCache *desc_cache; int64_t len; @@ -768,15 +767,14 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) i = head; - len = address_space_cache_init(&vring_desc_cache, vdev->dma_as, - vq->vring.desc, max * sizeof(VRingDesc), - false); - if (len < max * sizeof(VRingDesc)) { + rcu_read_lock(); + caches = atomic_rcu_read(&vq->vring.caches); + if (caches->desc.len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); goto done; } - desc_cache = &vring_desc_cache; + desc_cache = &caches->desc; vring_desc_read(vdev, &desc, desc_cache, i); if (desc.flags & VRING_DESC_F_INDIRECT) { if (desc.len % sizeof(VRingDesc)) { @@ -850,7 +848,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); done: address_space_cache_destroy(&indirect_desc_cache); - address_space_cache_destroy(&vring_desc_cache); 
+ rcu_read_unlock(); return elem; From ca0176ad8368668c5ad2b428361652e05984e930 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:19 +0100 Subject: [PATCH 11/23] virtio: check for vring setup in virtio_queue_update_used_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the vring has not been set up, it is not necessary for vring_used_idx to do anything (as is already the case when the caller is virtio_load). This is harmless for now, but it will be a problem when the MemoryRegionCache has not been set up. Signed-off-by: Paolo Bonzini Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d62509d6cf..cdafcec9be 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2156,7 +2156,9 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx) void virtio_queue_update_used_idx(VirtIODevice *vdev, int n) { - vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]); + if (vdev->vq[n].vring.desc) { + vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]); + } } void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n) From 97cd965c070152bc626c7507df9fb356bbe1cd81 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 16:40:20 +0100 Subject: [PATCH 12/23] virtio: use VRingMemoryRegionCaches for avail and used rings The virtio-net change is necessary because it uses virtqueue_fill and virtqueue_flush instead of the more convenient virtqueue_push. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/virtio-net.c | 14 ++++- hw/virtio/virtio.c | 132 ++++++++++++++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 37 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 354a19eab8..c32168077a 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1130,7 +1130,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) return 0; } -static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) +static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, + size_t size) { VirtIONet *n = qemu_get_nic_opaque(nc); VirtIONetQueue *q = virtio_net_get_subqueue(nc); @@ -1233,6 +1234,17 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t return size; } +static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, + size_t size) +{ + ssize_t r; + + rcu_read_lock(); + r = virtio_net_receive_rcu(nc, buf, size); + rcu_read_unlock(); + return r; +} + static int32_t virtio_net_flush_tx(VirtIONetQueue *q); static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index cdafcec9be..c08e50fb41 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -173,6 +173,7 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) virtio_init_region_cache(vdev, n); } +/* Called within rcu_read_lock(). */ static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, MemoryRegionCache *cache, int i) { @@ -184,88 +185,110 @@ static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, virtio_tswap16s(vdev, &desc->next); } +/* Called within rcu_read_lock(). 
*/ static inline uint16_t vring_avail_flags(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, flags); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, flags); + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_idx(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, idx); - vq->shadow_avail_idx = virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, idx); + vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); return vq->shadow_avail_idx; } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) { - hwaddr pa; - pa = vq->vring.avail + offsetof(VRingAvail, ring[i]); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingAvail, ring[i]); + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); } +/* Called within rcu_read_lock(). */ static inline uint16_t vring_get_used_event(VirtQueue *vq) { return vring_avail_ring(vq, vq->vring.num); } +/* Called within rcu_read_lock(). */ static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, int i) { - hwaddr pa; + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, ring[i]); virtio_tswap32s(vq->vdev, &uelem->id); virtio_tswap32s(vq->vdev, &uelem->len); - pa = vq->vring.used + offsetof(VRingUsed, ring[i]); - address_space_write(vq->vdev->dma_as, pa, MEMTXATTRS_UNSPECIFIED, - (void *)uelem, sizeof(VRingUsedElem)); + address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem)); + address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem)); } +/* Called within rcu_read_lock(). */ static uint16_t vring_used_idx(VirtQueue *vq) { - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - return virtio_lduw_phys(vq->vdev, pa); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, idx); + return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); } +/* Called within rcu_read_lock(). */ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) { - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, idx); - virtio_stw_phys(vq->vdev, pa, val); + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); + hwaddr pa = offsetof(VRingUsed, idx); + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); + address_space_cache_invalidate(&caches->used, pa, sizeof(val)); vq->used_idx = val; } +/* Called within rcu_read_lock(). */ static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) { + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); VirtIODevice *vdev = vq->vdev; - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) | mask); + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + + virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } +/* Called within rcu_read_lock(). 
*/ static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) { + VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches); VirtIODevice *vdev = vq->vdev; - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, flags); - virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) & ~mask); + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + + virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); } +/* Called within rcu_read_lock(). */ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) { + VRingMemoryRegionCaches *caches; hwaddr pa; if (!vq->notification) { return; } - pa = vq->vring.used + offsetof(VRingUsed, ring[vq->vring.num]); - virtio_stw_phys(vq->vdev, pa, val); + + caches = atomic_rcu_read(&vq->vring.caches); + pa = offsetof(VRingUsed, ring[vq->vring.num]); + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); } void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; + + rcu_read_lock(); if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vring_avail_idx(vq)); } else if (enable) { @@ -277,6 +300,7 @@ void virtio_queue_set_notification(VirtQueue *vq, int enable) /* Expose avail event/used flags before caller checks the avail idx. */ smp_mb(); } + rcu_read_unlock(); } int virtio_queue_ready(VirtQueue *vq) @@ -285,8 +309,9 @@ int virtio_queue_ready(VirtQueue *vq) } /* Fetch avail_idx from VQ memory only when we really need to know if - * guest has added some buffers. */ -int virtio_queue_empty(VirtQueue *vq) + * guest has added some buffers. + * Called within rcu_read_lock(). */ +static int virtio_queue_empty_rcu(VirtQueue *vq) { if (vq->shadow_avail_idx != vq->last_avail_idx) { return 0; @@ -295,6 +320,20 @@ int virtio_queue_empty(VirtQueue *vq) return vring_avail_idx(vq) == vq->last_avail_idx; } +int virtio_queue_empty(VirtQueue *vq) +{ + bool empty; + + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + rcu_read_lock(); + empty = vring_avail_idx(vq) == vq->last_avail_idx; + rcu_read_unlock(); + return empty; +} + static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { @@ -373,6 +412,7 @@ bool virtqueue_rewind(VirtQueue *vq, unsigned int num) return true; } +/* Called within rcu_read_lock(). */ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx) { @@ -393,6 +433,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, vring_used_write(vq, &uelem, idx); } +/* Called within rcu_read_lock(). */ void virtqueue_flush(VirtQueue *vq, unsigned int count) { uint16_t old, new; @@ -416,10 +457,13 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { + rcu_read_lock(); virtqueue_fill(vq, elem, len, 0); virtqueue_flush(vq, 1); + rcu_read_unlock(); } +/* Called within rcu_read_lock(). */ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) { uint16_t num_heads = vring_avail_idx(vq) - idx; @@ -439,6 +483,7 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) return num_heads; } +/* Called within rcu_read_lock(). 
*/ static bool virtqueue_get_head(VirtQueue *vq, unsigned int idx, unsigned int *head) { @@ -740,8 +785,9 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) if (unlikely(vdev->broken)) { return NULL; } - if (virtio_queue_empty(vq)) { - return NULL; + rcu_read_lock(); + if (virtio_queue_empty_rcu(vq)) { + goto done; } /* Needed after virtio_queue_empty(), see comment in * virtqueue_num_heads(). */ @@ -754,11 +800,11 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) if (vq->inuse >= vq->vring.num) { virtio_error(vdev, "Virtqueue size exceeded"); - return NULL; + goto done; } if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) { - return NULL; + goto done; } if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { @@ -767,7 +813,6 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz) i = head; - rcu_read_lock(); caches = atomic_rcu_read(&vq->vring.caches); if (caches->desc.len < max * sizeof(VRingDesc)) { virtio_error(vdev, "Cannot map descriptor ring"); @@ -1483,6 +1528,7 @@ static void virtio_set_isr(VirtIODevice *vdev, int value) } } +/* Called within rcu_read_lock(). */ static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq) { uint16_t old, new; @@ -1508,7 +1554,12 @@ static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq) void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) { - if (!virtio_should_notify(vdev, vq)) { + bool should_notify; + rcu_read_lock(); + should_notify = virtio_should_notify(vdev, vq); + rcu_read_unlock(); + + if (!should_notify) { return; } @@ -1535,7 +1586,12 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) { - if (!virtio_should_notify(vdev, vq)) { + bool should_notify; + rcu_read_lock(); + should_notify = virtio_should_notify(vdev, vq); + rcu_read_unlock(); + + if (!should_notify) { return; } @@ -1996,6 +2052,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) } } + rcu_read_lock(); for (i = 0; i < num; i++) { if (vdev->vq[i].vring.desc) { uint16_t nheads; @@ -2030,6 +2087,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) } } } + rcu_read_unlock(); return 0; } @@ -2156,9 +2214,11 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx) void virtio_queue_update_used_idx(VirtIODevice *vdev, int n) { + rcu_read_lock(); if (vdev->vq[n].vring.desc) { vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]); } + rcu_read_unlock(); } void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n) From b4b9862b536f41fcdf6ad193a306a852c5b5b71a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 17 Feb 2017 04:52:16 +0200 Subject: [PATCH 13/23] virtio: Fix no interrupt when not creating msi controller For the ARM virt machine, if we use virt-2.7, which will not create an ITS node, virtio-net cannot receive interrupts, so it can't get an IP address through DHCP. This fixes commit 83d768b (virtio: set ISR on dataplane notifications). Signed-off-by: Shannon Zhao Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/virtio/virtio.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index c08e50fb41..23483c752f 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1584,6 +1584,12 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) event_notifier_set(&vq->guest_notifier); } +static void virtio_irq(VirtQueue *vq) +{ + virtio_set_isr(vq->vdev, 0x1); + virtio_notify_vector(vq->vdev, vq->vector); +} + void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) { bool should_notify; @@ -1596,8 +1602,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) } trace_virtio_notify(vdev, vq); - virtio_set_isr(vq->vdev, 0x1); - virtio_notify_vector(vdev, vq->vector); + virtio_irq(vq); } void virtio_notify_config(VirtIODevice *vdev) @@ -2240,7 +2245,7 @@ static void virtio_queue_guest_notifier_read(EventNotifier *n) { VirtQueue *vq = container_of(n, VirtQueue, guest_notifier); if (event_notifier_test_and_clear(n)) { - virtio_notify_vector(vq->vdev, vq->vector); + virtio_irq(vq); } } From d4e9b75aa03d3f0e08fa431998764d7f72d78a48 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 16 Feb 2017 12:06:01 +0800 Subject: [PATCH 14/23] pcie: simplify pcie_add_capability() When we add PCIe extended capabilities, we should follow the rule of adding the head extended cap (at offset 0x100) first, then the rest of them. Meanwhile, we always add new capability bits at the end of the list. Here the "next" looks meaningless in all cases since it should always be zero (along with the "header"). Simplify the function a bit to make it more readable. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index f4dd177bc4..fc54bfd53d 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -665,32 +665,24 @@ void pcie_add_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver, uint16_t offset, uint16_t size) { - uint32_t header; - uint16_t next; - assert(offset >= PCI_CONFIG_SPACE_SIZE); assert(offset < offset + size); assert(offset + size <= PCIE_CONFIG_SPACE_SIZE); assert(size >= 8); assert(pci_is_express(dev)); - if (offset == PCI_CONFIG_SPACE_SIZE) { - header = pci_get_long(dev->config + offset); - next = PCI_EXT_CAP_NEXT(header); - } else { + if (offset != PCI_CONFIG_SPACE_SIZE) { uint16_t prev; /* * 0xffffffff is not a valid cap id (it's a 16 bit field). use * internally to find the last capability in the linked list. */ - next = pcie_find_capability_list(dev, 0xffffffff, &prev); - + pcie_find_capability_list(dev, 0xffffffff, &prev); assert(prev >= PCI_CONFIG_SPACE_SIZE); - assert(next == 0); pcie_ext_cap_set_next(dev, prev, offset); } - pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, next)); + pci_set_long(dev->config + offset, PCI_EXT_CAP(cap_id, cap_ver, 0)); /* Make capability read-only by default */ memset(dev->wmask + offset, 0, size); From 3213835720f0b79ea60ead8e81f356695a446d1d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:03 +0800 Subject: [PATCH 15/23] vfio: trace map/unmap for notify as well We trace its range, but we don't know whether it's a MAP or an UNMAP. Let's dump that as well. Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/vfio/common.c | 3 ++- hw/vfio/trace-events | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 801578b4b9..174f3512d1 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -305,7 +305,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) void *vaddr; int ret; - trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask); + trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP", + iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 8de8281357..2561c6d31a 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -84,7 +84,7 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" # hw/vfio/common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 -vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64 +vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ %"PRIx64" - %"PRIx64 vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add %"PRIx64" - %"PRIx64 vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] %"PRIx64" - %"PRIx64 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] %"PRIx64" - %"PRIx64" [%p]" From 4a4b88fbe1a95e80a2e29830e69e1deded407fc1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:04 +0800 Subject: [PATCH 16/23] vfio: introduce vfio_get_vaddr() A cleanup for vfio_iommu_map_notify(). Now we will fetch vaddr even if the operation is unmap, but it won't hurt much. One thing to mention is that we need the RCU read lock to protect the whole translation and map/unmap procedure. Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/vfio/common.c | 73 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 174f3512d1..42c4790003 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -294,14 +294,50 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } +/* Called with rcu_read_lock held. */ +static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, + bool *read_only) +{ + MemoryRegion *mr; + hwaddr xlat; + hwaddr len = iotlb->addr_mask + 1; + bool writable = iotlb->perm & IOMMU_WO; + + /* + * The IOMMU TLB entry we have just covers translation through + * this IOMMU to its immediate target. We need to translate + * it the rest of the way through to memory. + */ + mr = address_space_translate(&address_space_memory, + iotlb->translated_addr, + &xlat, &len, writable); + if (!memory_region_is_ram(mr)) { + error_report("iommu map to non memory area %"HWADDR_PRIx"", + xlat); + return false; + } + + /* + * Translation truncates length to the IOMMU page size, + * check that it did not truncate too much. 
+ */ if (len & iotlb->addr_mask) { error_report("iommu has granularity incompatible with target AS"); return false; } + + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + *read_only = !writable || mr->readonly; + + return true; +} + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; - MemoryRegion *mr; - hwaddr xlat; - hwaddr len = iotlb->addr_mask + 1; + bool read_only; void *vaddr; int ret; @@ -314,34 +350,23 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) return; } - /* - * The IOMMU TLB entry we have just covers translation through - * this IOMMU to its immediate target. We need to translate - * it the rest of the way through to memory. - */ rcu_read_lock(); - mr = address_space_translate(&address_space_memory, - iotlb->translated_addr, - &xlat, &len, iotlb->perm & IOMMU_WO); - if (!memory_region_is_ram(mr)) { - error_report("iommu map to non memory area %"HWADDR_PRIx"", - xlat); - goto out; - } - /* - * Translation truncates length to the IOMMU page size, - * check that it did not truncate too much. - */ - if (len & iotlb->addr_mask) { - error_report("iommu has granularity incompatible with target AS"); + + if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { goto out; } if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - vaddr = memory_region_get_ram_ptr(mr) + xlat; + /* + * vaddr is only valid until rcu_read_unlock(). But after + * vfio_dma_map has set up the mapping the pages will be + * pinned by the kernel. This makes sure that the RAM backend + * of vaddr will always be there, even if the memory object is + * destroyed and its backing memory munmap-ed. + */ ret = vfio_dma_map(container, iova, iotlb->addr_mask + 1, vaddr, - !(iotlb->perm & IOMMU_WO) || mr->readonly); + read_only); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", From dfbd90e5b9b772b1c5a52bad9f6dbabb385a7dc2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:05 +0800 Subject: [PATCH 17/23] vfio: allow to notify unmap for very large region The Linux vfio driver supports VFIO_IOMMU_UNMAP_DMA for a very big region. This can be leveraged by the QEMU IOMMU implementation to clean up existing page mappings for an entire iova address space (by notifying with an IOTLB whose addr_mask is extremely large). However, the current vfio_iommu_map_notify() does not allow that: it makes sure that every translated address in the IOTLB falls into the RAM range. The check makes sense, but it is only meaningful for map operations and means little for unmaps. This patch moves the check into the map logic only, so that we get faster unmap handling (no need to translate again), and we can also better support unmapping a very big region even when it covers non-RAM or non-existent ranges. Acked-by: Alex Williamson Signed-off-by: Peter Xu Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/vfio/common.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 42c4790003..f3ba9b9007 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -352,11 +352,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); - if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { - goto out; - } - if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { + if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { + goto out; + } /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be From 3b40f0e53c2ebfcec8aabab7e91c11c5bd441ac0 Mon Sep 17 00:00:00 2001 From: Aviv Ben-David Date: Tue, 7 Feb 2017 16:28:06 +0800 Subject: [PATCH 18/23] intel_iommu: add "caching-mode" option This capability asks the guest to invalidate cache before each map operation. We can use this invalidation to trap map operations in the hypervisor. Signed-off-by: Aviv Ben-David [peterx: using "caching-mode" instead of "cache-mode" to align with spec] [peterx: re-write the subject to make it short and clear] Reviewed-by: Jason Wang Signed-off-by: Peter Xu Signed-off-by: Aviv Ben-David Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 5 +++++ hw/i386/intel_iommu_internal.h | 1 + include/hw/i386/intel_iommu.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3270fb9162..50251c3090 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2115,6 +2115,7 @@ static Property vtd_properties[] = { DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), + DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_END_OF_LIST(), }; @@ -2496,6 +2497,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_DT; } + if (s->caching_mode) { + s->cap |= VTD_CAP_CM; + } + vtd_reset_context_cache(s); vtd_reset_iotlb(s); diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 356f188b73..41041219ba 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -202,6 +202,7 @@ #define VTD_CAP_MAMV (VTD_MAMV << 48) #define VTD_CAP_PSI (1ULL << 39) #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) +#define VTD_CAP_CM (1ULL << 7) /* Supported Adjusted Guest Address Widths */ #define VTD_CAP_SAGAW_SHIFT 8 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 405c9d122e..fe645aa93a 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -257,6 +257,8 @@ struct IntelIOMMUState { uint8_t womask[DMAR_REG_SIZE]; /* WO (write only - read returns 0) */ uint32_t version; + bool caching_mode; /* RO - is cap CM enabled? */ + dma_addr_t root; /* Current root table pointer */ bool root_extended; /* Type of root table (extended or not) */ bool dmar_enabled; /* Set if DMA remapping is enabled */ From 046ab7e9beea6abdcc016556f82d7075e4ff0155 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:07 +0800 Subject: [PATCH 19/23] intel_iommu: simplify irq region translation Now we have a standalone memory region for MSI, all the irq region requests should be redirected there. Cleaning up the block with an assertion instead. Reviewed-by: Jason Wang Signed-off-by: Peter Xu Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 50251c3090..86d19bbb9a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -818,28 +818,12 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, bool writes = true; VTDIOTLBEntry *iotlb_entry; - /* Check if the request is in interrupt address range */ - if (vtd_is_interrupt_addr(addr)) { - if (is_write) { - /* FIXME: since we don't know the length of the access here, we - * treat Non-DWORD length write requests without PASID as - * interrupt requests, too. Withoud interrupt remapping support, - * we just use 1:1 mapping. - */ - VTD_DPRINTF(MMU, "write request to interrupt address " - "gpa 0x%"PRIx64, addr); - entry->iova = addr & VTD_PAGE_MASK_4K; - entry->translated_addr = addr & VTD_PAGE_MASK_4K; - entry->addr_mask = ~VTD_PAGE_MASK_4K; - entry->perm = IOMMU_WO; - return; - } else { - VTD_DPRINTF(GENERAL, "error: read request from interrupt address " - "gpa 0x%"PRIx64, addr); - vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write); - return; - } - } + /* + * We have standalone memory region for interrupt addresses, we + * should never receive translation requests in this region. + */ + assert(!vtd_is_interrupt_addr(addr)); + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { From 6e9055641b7a02131a7879369bcd1fe6c8ccb503 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:08 +0800 Subject: [PATCH 20/23] intel_iommu: renaming gpa to iova where proper There are lots of places in current intel_iommu.c codes that named "iova" as "gpa". It is really confusing to use a name "gpa" in these places (which is very easily to be understood as "Guest Physical Address", while it's not). To make the codes (much) easier to be read, I decided to do this once and for all. No functional change is made. Only literal ones. Reviewed-by: Jason Wang Signed-off-by: Peter Xu Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 44 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 86d19bbb9a..0c94b79f7f 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -259,7 +259,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint64_t *key = g_malloc(sizeof(*key)); uint64_t gfn = vtd_get_iotlb_gfn(addr, level); - VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64 + VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64 " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { @@ -575,12 +575,12 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) return slpte; } -/* Given a gpa and the level of paging structure, return the offset of current - * level. +/* Given an iova and the level of paging structure, return the offset + * of current level. 
*/ -static inline uint32_t vtd_gpa_level_offset(uint64_t gpa, uint32_t level) +static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) { - return (gpa >> vtd_slpt_level_shift(level)) & + return (iova >> vtd_slpt_level_shift(level)) & ((1ULL << VTD_SL_LEVEL_BITS) - 1); } @@ -628,12 +628,12 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) } } -/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level +/* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, - uint64_t *slptep, uint32_t *slpte_level, - bool *reads, bool *writes) +static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, + uint64_t *slptep, uint32_t *slpte_level, + bool *reads, bool *writes) { dma_addr_t addr = vtd_get_slpt_base_from_context(ce); uint32_t level = vtd_get_level_from_context_entry(ce); @@ -642,11 +642,11 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); uint64_t access_right_check; - /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG - * and AW in context-entry. + /* Check if @iova is above 2^X-1, where X is the minimum of MGAW + * in CAP_REG and AW in context-entry. */ - if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) { - VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa); + if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) { + VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -654,13 +654,13 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, access_right_check = is_write ? VTD_SL_W : VTD_SL_R; while (true) { - offset = vtd_gpa_level_offset(gpa, level); + offset = vtd_iova_level_offset(iova, level); slpte = vtd_get_slpte(addr, offset); if (slpte == (uint64_t)-1) { VTD_DPRINTF(GENERAL, "error: fail to access second-level paging " - "entry at level %"PRIu32 " for gpa 0x%"PRIx64, - level, gpa); + "entry at level %"PRIu32 " for iova 0x%"PRIx64, + level, iova); if (level == vtd_get_level_from_context_entry(ce)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; @@ -672,8 +672,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write, *writes = (*writes) && (slpte & VTD_SL_W); if (!(slpte & access_right_check)) { VTD_DPRINTF(GENERAL, "error: lack of %s permission for " - "gpa 0x%"PRIx64 " slpte 0x%"PRIx64, - (is_write ? "write" : "read"), gpa, slpte); + "iova 0x%"PRIx64 " slpte 0x%"PRIx64, + (is_write ? "write" : "read"), iova, slpte); return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; } if (vtd_slpte_nonzero_rsvd(slpte, level)) { @@ -827,7 +827,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { - VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64 + VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64 " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, iotlb_entry->slpte, iotlb_entry->domain_id); slpte = iotlb_entry->slpte; @@ -867,8 +867,8 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry->context_cache_gen = s->context_cache_gen; } - ret_fr = vtd_gpa_to_slpte(&ce, addr, is_write, &slpte, &level, - &reads, &writes); + ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, + &reads, &writes); if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { @@ -2033,7 +2033,7 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, is_write, &ret); VTD_DPRINTF(MMU, "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 - " gpa 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), + " iova 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), VTD_PCI_SLOT(vtd_as->devfn), VTD_PCI_FUNC(vtd_as->devfn), vtd_as->devfn, addr, ret.translated_addr); return ret; From bc535e59c4ec7d98a8011a3ffb6580fa04b7745c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:09 +0800 Subject: [PATCH 21/23] intel_iommu: convert dbg macros to traces for inv VT-d codes are still using static DEBUG_INTEL_IOMMU macro. That's not good, and we should end the day when we need to recompile the code before getting useful debugging information for vt-d. Time to switch to the trace system. This is the first patch to do it. Signed-off-by: Peter Xu Reviewed-by: Jason Wang Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 95 +++++++++++++++++-------------------------- hw/i386/trace-events | 18 ++++++++ 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 0c94b79f7f..08e43b6ee6 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -35,6 +35,7 @@ #include "sysemu/kvm.h" #include "hw/i386/apic_internal.h" #include "kvm_i386.h" +#include "trace.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -474,22 +475,19 @@ static void vtd_handle_inv_queue_error(IntelIOMMUState *s) /* Set the IWC field and try to generate an invalidation completion interrupt */ static void vtd_generate_completion_event(IntelIOMMUState *s) { - VTD_DPRINTF(INV, "completes an invalidation wait command with " - "Interrupt Flag"); if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { - VTD_DPRINTF(INV, "there is a previous interrupt condition to be " - "serviced by software, " - "new invalidation event is not generated"); + trace_vtd_inv_desc_wait_irq("One pending, skip current"); return; } vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { - VTD_DPRINTF(INV, "IM filed in IECTL_REG is set, new invalidation " - "event is not generated"); + trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " + "new event not generated"); return; } else { /* Generate the interrupt event */ + trace_vtd_inv_desc_wait_irq("Generating complete event"); vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); } @@ -923,6 +921,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { + trace_vtd_inv_desc_cc_global(); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { vtd_reset_context_cache(s); @@ -962,9 +961,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, uint16_t mask; VTDBus *vtd_bus; VTDAddressSpace *vtd_as; - uint16_t devfn; + uint8_t bus_n, devfn; uint16_t devfn_it; + trace_vtd_inv_desc_cc_devices(source_id, func_mask); + switch (func_mask & 3) { case 0: mask = 0; /* No bits in the SID field masked */ @@ -980,16 +981,16 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, break; } mask = ~mask; - VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16 - " mask %"PRIu16, source_id, mask); - vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + + bus_n = VTD_SID_TO_BUS(source_id); + vtd_bus = vtd_find_as_from_bus_num(s, bus_n); if (vtd_bus) { devfn = VTD_SID_TO_DEVFN(source_id); for (devfn_it = 0; devfn_it < X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) { vtd_as = vtd_bus->dev_as[devfn_it]; if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { - VTD_DPRINTF(INV, "invalidate context-cahce of devfn 0x%"PRIx16, - devfn_it); + trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), + VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; } } @@ -1302,9 +1303,7 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Invalidation " - "Wait Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); return false; } if (inv_desc->lo & 
VTD_INV_DESC_WAIT_SW) { @@ -1316,21 +1315,18 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) /* FIXME: need to be masked with HAW? */ dma_addr_t status_addr = inv_desc->hi; - VTD_DPRINTF(INV, "status data 0x%x, status addr 0x%"PRIx64, - status_data, status_addr); + trace_vtd_inv_desc_wait_sw(status_addr, status_data); status_data = cpu_to_le32(status_data); if (dma_memory_write(&address_space_memory, status_addr, &status_data, sizeof(status_data))) { - VTD_DPRINTF(GENERAL, "error: fail to perform a coherent write"); + trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); return false; } } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { /* Interrupt flag */ - VTD_DPRINTF(INV, "Invalidation Wait Descriptor interrupt completion"); vtd_generate_completion_event(s); } else { - VTD_DPRINTF(GENERAL, "error: invalid Invalidation Wait Descriptor: " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1339,30 +1335,29 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) static bool vtd_process_context_cache_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { + uint16_t sid, fmask; + if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Context-cache " - "Invalidate Descriptor"); + trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); return false; } switch (inv_desc->lo & VTD_INV_DESC_CC_G) { case VTD_INV_DESC_CC_DOMAIN: - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); + trace_vtd_inv_desc_cc_domain( + (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); /* Fall through */ case VTD_INV_DESC_CC_GLOBAL: - VTD_DPRINTF(INV, "global invalidation"); vtd_context_global_invalidate(s); break; case VTD_INV_DESC_CC_DEVICE: - vtd_context_device_invalidate(s, VTD_INV_DESC_CC_SID(inv_desc->lo), - VTD_INV_DESC_CC_FM(inv_desc->lo)); + sid = VTD_INV_DESC_CC_SID(inv_desc->lo); + fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); + vtd_context_device_invalidate(s, sid, fmask); break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity in Context-cache " - "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1376,22 +1371,19 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in IOTLB " - "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { case VTD_INV_DESC_IOTLB_GLOBAL: - VTD_DPRINTF(INV, "global invalidation"); + trace_vtd_inv_desc_iotlb_global(); vtd_iotlb_global_invalidate(s); break; case VTD_INV_DESC_IOTLB_DOMAIN: domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - domain_id); + trace_vtd_inv_desc_iotlb_domain(domain_id); vtd_iotlb_domain_invalidate(s, domain_id); break; @@ -1399,20 +1391,16 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); am = 
VTD_INV_DESC_IOTLB_AM(inv_desc->hi); - VTD_DPRINTF(INV, "page-selective invalidation domain 0x%"PRIx16 - " addr 0x%"PRIx64 " mask %"PRIu8, domain_id, addr, am); + trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); if (am > VTD_MAMV) { - VTD_DPRINTF(GENERAL, "error: supported max address mask value is " - "%"PRIu8, (uint8_t)VTD_MAMV); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } vtd_iotlb_page_invalidate(s, domain_id, addr, am); break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity in IOTLB Invalidate " - "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } return true; @@ -1511,33 +1499,28 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) switch (desc_type) { case VTD_INV_DESC_CC: - VTD_DPRINTF(INV, "Context-cache Invalidate Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); if (!vtd_process_context_cache_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_IOTLB: - VTD_DPRINTF(INV, "IOTLB Invalidate Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); if (!vtd_process_iotlb_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_WAIT: - VTD_DPRINTF(INV, "Invalidation Wait Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); if (!vtd_process_wait_desc(s, &inv_desc)) { return false; } break; case VTD_INV_DESC_IEC: - VTD_DPRINTF(INV, "Invalidation Interrupt Entry Cache " - "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); if (!vtd_process_inv_iec_desc(s, &inv_desc)) { return false; } @@ -1552,9 +1535,7 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) break; default: - VTD_DPRINTF(GENERAL, "error: unkonw Invalidation Descriptor type " - "hi 0x%"PRIx64 " lo 0x%"PRIx64 " type %"PRIu8, - inv_desc.hi, inv_desc.lo, desc_type); + trace_vtd_inv_desc_invalid(inv_desc.hi, inv_desc.lo); return false; } s->iq_head++; diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 1cc4a10a07..02aeaab688 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -3,6 +3,24 @@ # hw/i386/x86-iommu.c x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 +# hw/i386/intel_iommu.c +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" +vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64 +vtd_inv_desc_invalid(uint64_t hi, uint64_t lo) "invalid inv desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16 +vtd_inv_desc_cc_global(void) "context invalidate globally" +vtd_inv_desc_cc_device(uint8_t bus, uint8_t dev, uint8_t fn) "context invalidate device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_inv_desc_cc_devices(uint16_t sid, uint16_t fmask) "context invalidate devices sid 0x%"PRIx16" fmask 0x%"PRIx16 +vtd_inv_desc_cc_invalid(uint64_t hi, uint64_t lo) "invalid context-cache desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_iotlb_global(void) "iotlb invalidate global" +vtd_inv_desc_iotlb_domain(uint16_t domain) "iotlb invalidate whole domain 0x%"PRIx16 
+vtd_inv_desc_iotlb_pages(uint16_t domain, uint64_t addr, uint8_t mask) "iotlb invalidate domain 0x%"PRIx16" addr 0x%"PRIx64" mask 0x%"PRIx8 +vtd_inv_desc_iotlb_invalid(uint64_t hi, uint64_t lo) "invalid iotlb desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write addr 0x%"PRIx64" data 0x%"PRIx32 +vtd_inv_desc_wait_irq(const char *msg) "%s" +vtd_inv_desc_wait_invalid(uint64_t hi, uint64_t lo) "invalid wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 + # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 From 6c441e1d613a1f91a53bd215f2ed1f9559194f21 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:10 +0800 Subject: [PATCH 22/23] intel_iommu: convert dbg macros to trace for trans Another patch to convert the DPRINTF() stuffs. This patch focuses on the address translation path and caching. Signed-off-by: Peter Xu Reviewed-by: Jason Wang Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 69 +++++++++++++++---------------------------- hw/i386/trace-events | 10 +++++++ 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 08e43b6ee6..ad304f681c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -260,11 +260,9 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint64_t *key = g_malloc(sizeof(*key)); uint64_t gfn = vtd_get_iotlb_gfn(addr, level); - VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64 - " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte, - domain_id); + trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { - VTD_DPRINTF(CACHE, "iotlb exceeds size limit, forced to reset"); + trace_vtd_iotlb_reset("iotlb exceeds size limit"); vtd_reset_iotlb(s); } @@ -505,8 +503,7 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, addr = s->root + index * sizeof(*re); if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { - VTD_DPRINTF(GENERAL, "error: fail to access root-entry at 0x%"PRIx64 - " + %"PRIu8, s->root, index); + trace_vtd_re_invalid(re->rsvd, re->val); re->val = 0; return -VTD_FR_ROOT_TABLE_INV; } @@ -524,15 +521,10 @@ static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index, { dma_addr_t addr; - if (!vtd_root_entry_present(root)) { - VTD_DPRINTF(GENERAL, "error: root-entry is not present"); - return -VTD_FR_ROOT_ENTRY_P; - } + /* we have checked that root entry is present */ addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce); if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) { - VTD_DPRINTF(GENERAL, "error: fail to access context-entry at 0x%"PRIx64 - " + %"PRIu8, - (uint64_t)(root->val & VTD_ROOT_ENTRY_CTP), index); + trace_vtd_re_invalid(root->rsvd, root->val); return -VTD_FR_CONTEXT_TABLE_INV; } ce->lo = le64_to_cpu(ce->lo); @@ -704,12 +696,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if (!vtd_root_entry_present(&re)) { - VTD_DPRINTF(GENERAL, "error: root-entry #%"PRIu8 " is 
not present", - bus_num); + /* Not error - it's okay we don't have root entry. */ + trace_vtd_re_not_present(bus_num); return -VTD_FR_ROOT_ENTRY_P; } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in root-entry " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, re.rsvd, re.val); + trace_vtd_re_invalid(re.rsvd, re.val); return -VTD_FR_ROOT_ENTRY_RSVD; } @@ -719,22 +710,17 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if (!vtd_context_entry_present(ce)) { - VTD_DPRINTF(GENERAL, - "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " - "is not present", devfn, bus_num); + /* Not error - it's okay we don't have context entry. */ + trace_vtd_ce_not_present(bus_num, devfn); return -VTD_FR_CONTEXT_ENTRY_P; } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { - VTD_DPRINTF(GENERAL, - "error: non-zero reserved field in context-entry " - "hi 0x%"PRIx64 " lo 0x%"PRIx64, ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_RSVD; } /* Check if the programming of context-entry is valid */ if (!vtd_is_level_supported(s, vtd_get_level_from_context_entry(ce))) { - VTD_DPRINTF(GENERAL, "error: unsupported Address Width value in " - "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64, - ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } else { switch (ce->lo & VTD_CONTEXT_ENTRY_TT) { @@ -743,9 +729,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, case VTD_CONTEXT_TT_DEV_IOTLB: break; default: - VTD_DPRINTF(GENERAL, "error: unsupported Translation Type in " - "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64, - ce->hi, ce->lo); + trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_INV; } } @@ -825,9 +809,8 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { - VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64 - " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, - iotlb_entry->slpte, iotlb_entry->domain_id); + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); slpte = iotlb_entry->slpte; reads = iotlb_entry->read_flags; writes = iotlb_entry->write_flags; @@ -836,10 +819,9 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, } /* Try to fetch context-entry from cache first */ if (cc_entry->context_cache_gen == s->context_cache_gen) { - VTD_DPRINTF(CACHE, "hit context-cache bus %d devfn %d " - "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 ")", - bus_num, devfn, cc_entry->context_entry.hi, - cc_entry->context_entry.lo, cc_entry->context_cache_gen); + trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, + cc_entry->context_entry.lo, + cc_entry->context_cache_gen); ce = cc_entry->context_entry; is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; } else { @@ -848,19 +830,16 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA " - "requests through this context-entry " - "(with FPD Set)"); + trace_vtd_fault_disabled(); } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } return; } /* Update context-cache */ - VTD_DPRINTF(CACHE, "update context-cache bus %d devfn %d " - "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 
"->%"PRIu32 ")", - bus_num, devfn, ce.hi, ce.lo, - cc_entry->context_cache_gen, s->context_cache_gen); + trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, + cc_entry->context_cache_gen, + s->context_cache_gen); cc_entry->context_entry = ce; cc_entry->context_cache_gen = s->context_cache_gen; } @@ -870,8 +849,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests " - "through this context-entry (with FPD Set)"); + trace_vtd_fault_disabled(); } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } @@ -1031,6 +1009,7 @@ static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) { + trace_vtd_iotlb_reset("global invalidation recved"); vtd_reset_iotlb(s); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 02aeaab688..88ad5e4c43 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -20,6 +20,16 @@ vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write vtd_inv_desc_wait_irq(const char *msg) "%s" vtd_inv_desc_wait_invalid(uint64_t hi, uint64_t lo) "invalid wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" +vtd_re_invalid(uint64_t hi, uint64_t lo) "invalid root entry hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" +vtd_ce_invalid(uint64_t hi, uint64_t lo) "invalid context entry hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 +vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32 +vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32 +vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)" +vtd_fault_disabled(void) "Fault processing disabled for context entry" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 From 7e58326ad7e79b8c5dbcc6f24e9dc1523d84c11b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Feb 2017 16:28:11 +0800 Subject: [PATCH 23/23] intel_iommu: vtd_slpt_level_shift check level This helps in debugging incorrect level passed in. Reviewed-by: Jason Wang Signed-off-by: Peter Xu Reviewed-by: David Gibson Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/intel_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index ad304f681c..22d8226e43 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -168,6 +168,7 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, /* The shift of an addr for a certain level of paging structure */ static inline uint32_t vtd_slpt_level_shift(uint32_t level) { + assert(level != 0); return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; }
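
As a quick reference for the second-level paging arithmetic that the VT-d patches above rely on (the gpa-to-iova rename, the translation tracing, and the final vtd_slpt_level_shift assertion), here is a minimal standalone sketch. It assumes the 4KiB-page, 9-bits-per-level layout implied by the values in intel_iommu.c; the constants and function names below are local stand-ins for this sketch, not QEMU's VTD_* identifiers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_4K  12   /* 4KiB second-level pages (assumed) */
#define SL_LEVEL_BITS  9    /* 9 index bits per paging-structure level (assumed) */

/* Shift of an iova for a given paging-structure level; level 1 is the leaf. */
static inline uint32_t slpt_level_shift(uint32_t level)
{
    assert(level != 0);     /* mirrors the check added by the last patch above */
    return PAGE_SHIFT_4K + (level - 1) * SL_LEVEL_BITS;
}

/* Index into the table at 'level' for the given iova. */
static inline uint32_t iova_level_offset(uint64_t iova, uint32_t level)
{
    return (iova >> slpt_level_shift(level)) & ((1ULL << SL_LEVEL_BITS) - 1);
}

int main(void)
{
    uint64_t iova = 0x12345678000ULL;
    uint32_t level;

    /* Walk from the top level down to the leaf, printing each table index. */
    for (level = 3; level >= 1; level--) {
        printf("level %u: shift %u, index %u\n",
               level, slpt_level_shift(level), iova_level_offset(iova, level));
    }
    return 0;
}

A 3-level table covers 12 + 3 * 9 = 39 bits of address, and the value being decomposed is an I/O virtual address rather than a guest physical one, which is what the gpa-to-iova rename is meant to make clear.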