From 10dab9f2635b9bab23a2b29974b526e62bb61268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:02 +0200 Subject: [PATCH 01/21] vdpa: Skip the maps not in the iova tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Next patch will skip the registering of dma maps that the vdpa device rejects in the iova tree. We need to consider that here or we cause a SIGSEGV accessing result. Reported-by: Lei Yang Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 3ff9ce3501..983d3697b0 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -289,6 +289,10 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, }; result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); + if (!result) { + /* The memory listener map wasn't mapped */ + return; + } iova = result->iova; vhost_iova_tree_remove(v->iova_tree, result); } From 7dab70bec397e3522211e7bcc36d879bad8154c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:03 +0200 Subject: [PATCH 02/21] vdpa: do not save failed dma maps in SVQ iova tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a map fails for whatever reason, it must not be saved in the tree. Otherwise, qemu will try to unmap it in cleanup, leaving to more errors. Fixes: 34e3c94eda ("vdpa: Add custom IOTLB translations to SVQ") Reported-by: Lei Yang Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 983d3697b0..7e28d2f674 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -176,6 +176,7 @@ static void vhost_vdpa_listener_commit(MemoryListener *listener) static void vhost_vdpa_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { + DMAMap mem_region = {}; struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); hwaddr iova; Int128 llend, llsize; @@ -212,13 +213,13 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (v->shadow_vqs_enabled) { - DMAMap mem_region = { - .translated_addr = (hwaddr)(uintptr_t)vaddr, - .size = int128_get64(llsize) - 1, - .perm = IOMMU_ACCESS_FLAG(true, section->readonly), - }; + int r; - int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); + mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr, + mem_region.size = int128_get64(llsize) - 1, + mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly), + + r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); if (unlikely(r != IOVA_OK)) { error_report("Can't allocate a mapping (%d)", r); goto fail; @@ -232,11 +233,16 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, vaddr, section->readonly); if (ret) { error_report("vhost vdpa map fail!"); - goto fail; + goto fail_map; } return; +fail_map: + if (v->shadow_vqs_enabled) { + vhost_iova_tree_remove(v->iova_tree, &mem_region); + } + fail: /* * On the initfn path, store the first error in the container so we From 69292a8e40f4dae8af5f04724e06392cdf03c09e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:04 +0200 Subject: [PATCH 03/21] util: accept iova_tree_remove_parameter by value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's convenient to call iova_tree_remove from a map returned from iova_tree_find or iova_tree_find_iova. With the current code this is not possible, since we will free it, and then we will try to search for it again. Fix it making accepting the map by value, forcing a copy of the argument. Not applying a fixes tag, since there is no use like that at the moment. Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang --- hw/i386/intel_iommu.c | 6 +++--- hw/virtio/vhost-iova-tree.c | 2 +- hw/virtio/vhost-iova-tree.h | 2 +- hw/virtio/vhost-vdpa.c | 6 +++--- include/qemu/iova-tree.h | 2 +- net/vhost-vdpa.c | 4 ++-- util/iova-tree.c | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 2162394e08..05d53a1aa9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1187,7 +1187,7 @@ static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) return ret; } /* Drop any existing mapping */ - iova_tree_remove(as->iova_tree, &target); + iova_tree_remove(as->iova_tree, target); /* Recover the correct type */ event->type = IOMMU_NOTIFIER_MAP; entry->perm = cache_perm; @@ -1200,7 +1200,7 @@ static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); return 0; } - iova_tree_remove(as->iova_tree, &target); + iova_tree_remove(as->iova_tree, target); } trace_vtd_page_walk_one(info->domain_id, entry->iova, @@ -3563,7 +3563,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) map.iova = n->start; map.size = size; - iova_tree_remove(as->iova_tree, &map); + iova_tree_remove(as->iova_tree, map); } static void vtd_address_space_unmap_all(IntelIOMMUState *s) diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c index 67bf6d57ab..3d03395a77 100644 --- a/hw/virtio/vhost-iova-tree.c +++ b/hw/virtio/vhost-iova-tree.c @@ -104,7 +104,7 @@ int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) * @iova_tree: The vhost iova tree * @map: The map to remove */ -void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map) { iova_tree_remove(iova_tree->iova_taddr_map, map); } diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h index 6a4f24e0f9..4adfd79ff0 100644 --- a/hw/virtio/vhost-iova-tree.h +++ b/hw/virtio/vhost-iova-tree.h @@ -22,6 +22,6 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, const DMAMap *map); int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); -void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map); #endif diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 7e28d2f674..87e0ad393f 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -240,7 +240,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, fail_map: if (v->shadow_vqs_enabled) { - vhost_iova_tree_remove(v->iova_tree, &mem_region); + vhost_iova_tree_remove(v->iova_tree, mem_region); } fail: @@ -300,7 +300,7 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, return; } iova = result->iova; - vhost_iova_tree_remove(v->iova_tree, result); + vhost_iova_tree_remove(v->iova_tree, *result); } vhost_vdpa_iotlb_batch_begin_once(v); ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); @@ -944,7 +944,7 @@ static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, needle->perm == IOMMU_RO); if (unlikely(r != 0)) { error_setg_errno(errp, -r, "Cannot map region to device"); - vhost_iova_tree_remove(v->iova_tree, needle); + vhost_iova_tree_remove(v->iova_tree, *needle); } return r == 0; diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h index 16bbfdf5f8..8528e5c98f 100644 --- a/include/qemu/iova-tree.h +++ b/include/qemu/iova-tree.h @@ -73,7 +73,7 @@ int iova_tree_insert(IOVATree *tree, const DMAMap *map); * all the mappings that are included in the provided range will be * removed from the tree. Here map->translated_addr is meaningless. */ -void iova_tree_remove(IOVATree *tree, const DMAMap *map); +void iova_tree_remove(IOVATree *tree, DMAMap map); /** * iova_tree_find: diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 303447a68e..a49e7e649d 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -244,7 +244,7 @@ static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr) error_report("Device cannot unmap: %s(%d)", g_strerror(r), r); } - vhost_iova_tree_remove(tree, map); + vhost_iova_tree_remove(tree, *map); } static size_t vhost_vdpa_net_cvq_cmd_len(void) @@ -297,7 +297,7 @@ static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, return true; dma_map_err: - vhost_iova_tree_remove(v->iova_tree, &map); + vhost_iova_tree_remove(v->iova_tree, map); return false; } diff --git a/util/iova-tree.c b/util/iova-tree.c index fee530a579..536789797e 100644 --- a/util/iova-tree.c +++ b/util/iova-tree.c @@ -164,11 +164,11 @@ void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator) g_tree_foreach(tree->tree, iova_tree_traverse, iterator); } -void iova_tree_remove(IOVATree *tree, const DMAMap *map) +void iova_tree_remove(IOVATree *tree, DMAMap map) { const DMAMap *overlap; - while ((overlap = iova_tree_find(tree, map))) { + while ((overlap = iova_tree_find(tree, &map))) { g_tree_remove(tree->tree, overlap); } } From b37c12be962f95fd1e93b470a5ff05f6e2035d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:05 +0200 Subject: [PATCH 04/21] vdpa: Remove SVQ vring from iova_tree at shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Although the device will be reset before usage, the right thing to do is to clean it. Reported-by: Lei Yang Fixes: 34e3c94eda ("vdpa: Add custom IOTLB translations to SVQ") Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 87e0ad393f..e16e0e222e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -898,6 +898,12 @@ static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, size = ROUND_UP(result->size, qemu_real_host_page_size()); r = vhost_vdpa_dma_unmap(v, result->iova, size); + if (unlikely(r < 0)) { + error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r); + return false; + } + + vhost_iova_tree_remove(v->iova_tree, *result); return r == 0; } From 5b590f51b923776a14d3bcafcb393279c1b72022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:06 +0200 Subject: [PATCH 05/21] vdpa: Make SVQ vring unmapping return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nothing actually reads the return value, but an error in cleaning some entries could cause device stop to abort, making a restart impossible. Better ignore explicitely the return value. Reported-by: Lei Yang Fixes: 34e3c94eda ("vdpa: Add custom IOTLB translations to SVQ") Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index e16e0e222e..e208dd000e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -884,7 +884,7 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, /** * Unmap a SVQ area in the device */ -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, +static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, const DMAMap *needle) { const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); @@ -893,38 +893,33 @@ static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, if (unlikely(!result)) { error_report("Unable to find SVQ address to unmap"); - return false; + return; } size = ROUND_UP(result->size, qemu_real_host_page_size()); r = vhost_vdpa_dma_unmap(v, result->iova, size); if (unlikely(r < 0)) { error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r); - return false; + return; } vhost_iova_tree_remove(v->iova_tree, *result); - return r == 0; } -static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, +static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, const VhostShadowVirtqueue *svq) { DMAMap needle = {}; struct vhost_vdpa *v = dev->opaque; struct vhost_vring_addr svq_addr; - bool ok; vhost_svq_get_vring_addr(svq, &svq_addr); needle.translated_addr = svq_addr.desc_user_addr; - ok = vhost_vdpa_svq_unmap_ring(v, &needle); - if (unlikely(!ok)) { - return false; - } + vhost_vdpa_svq_unmap_ring(v, &needle); needle.translated_addr = svq_addr.used_user_addr; - return vhost_vdpa_svq_unmap_ring(v, &needle); + vhost_vdpa_svq_unmap_ring(v, &needle); } /** @@ -1095,26 +1090,22 @@ err: return false; } -static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) +static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; if (!v->shadow_vqs) { - return true; + return; } for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); - bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); - if (unlikely(!ok)) { - return false; - } + vhost_vdpa_svq_unmap_rings(dev, svq); } if (v->migration_blocker) { migrate_del_blocker(v->migration_blocker); } - return true; } static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) @@ -1131,10 +1122,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) } vhost_vdpa_set_vring_ready(dev); } else { - ok = vhost_vdpa_svqs_stop(dev); - if (unlikely(!ok)) { - return -1; - } + vhost_vdpa_svqs_stop(dev); vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } From 8b64e486423b09db4463799727bf1fad62fe496a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:07 +0200 Subject: [PATCH 06/21] vhost: Always store new kick fd on vhost_svq_set_svq_kick_fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can unbind twice a file descriptor if we call twice vhost_svq_set_svq_kick_fd because of this. Since it comes from vhost and not from SVQ, that file descriptor could be a different thing that guest's vhost notifier. Likewise, it can happens the same if a guest start and stop the device multiple times. Reported-by: Lei Yang Fixes: dff4426fa6 ("vhost: Add Shadow VirtQueue kick forwarding capabilities") Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index e4956728dd..82a784d250 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -602,13 +602,13 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) event_notifier_set_handler(svq_kick, NULL); } + event_notifier_init_fd(svq_kick, svq_kick_fd); /* * event_notifier_set_handler already checks for guest's notifications if * they arrive at the new file descriptor in the switch, so there is no * need to explicitly check for them. */ if (poll_start) { - event_notifier_init_fd(svq_kick, svq_kick_fd); event_notifier_set(svq_kick); event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); } @@ -655,7 +655,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, */ void vhost_svq_stop(VhostShadowVirtqueue *svq) { - event_notifier_set_handler(&svq->svq_kick, NULL); + vhost_svq_set_svq_kick_fd(svq, VHOST_FILE_UNBIND); g_autofree VirtQueueElement *next_avail_elem = NULL; if (!svq->vq) { From 8b6d6119ad7fd983d192f60c4960fb6a9197d995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:20:08 +0200 Subject: [PATCH 07/21] vdpa: Use ring hwaddr at vhost_vdpa_svq_unmap_ring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce code duplication. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index e208dd000e..23ae5ef48b 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -884,10 +884,12 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, /** * Unmap a SVQ area in the device */ -static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, - const DMAMap *needle) +static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr) { - const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); + const DMAMap needle = { + .translated_addr = addr, + }; + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle); hwaddr size; int r; @@ -909,17 +911,14 @@ static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, const VhostShadowVirtqueue *svq) { - DMAMap needle = {}; struct vhost_vdpa *v = dev->opaque; struct vhost_vring_addr svq_addr; vhost_svq_get_vring_addr(svq, &svq_addr); - needle.translated_addr = svq_addr.desc_user_addr; - vhost_vdpa_svq_unmap_ring(v, &needle); + vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr); - needle.translated_addr = svq_addr.used_user_addr; - vhost_vdpa_svq_unmap_ring(v, &needle); + vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr); } /** @@ -997,7 +996,7 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); if (unlikely(!ok)) { error_prepend(errp, "Cannot create vq device region: "); - vhost_vdpa_svq_unmap_ring(v, &driver_region); + vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr); } addr->used_user_addr = device_region.iova; From 9c2ab2f1ec333be8614cc12272d4b91960704dbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:26 +0200 Subject: [PATCH 08/21] vhost: stop transfer elem ownership in vhost_handle_guest_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was easier to allow vhost_svq_add to handle the memory. Now that we will allow qemu to add elements to a SVQ without the guest's knowledge, it's better to handle it in the caller. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 82a784d250..a1261d4a0f 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -233,9 +233,6 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) /** * Add an element to a SVQ. * - * The caller must check that there is enough slots for the new element. It - * takes ownership of the element: In case of failure not ENOSPC, it is free. - * * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full */ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, @@ -252,7 +249,6 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head); if (unlikely(!ok)) { - g_free(elem); return -EINVAL; } @@ -293,7 +289,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) virtio_queue_set_notification(svq->vq, false); while (true) { - VirtQueueElement *elem; + g_autofree VirtQueueElement *elem; int r; if (svq->next_guest_avail_elem) { @@ -324,12 +320,14 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) * queue the current guest descriptor and ignore kicks * until some elements are used. */ - svq->next_guest_avail_elem = elem; + svq->next_guest_avail_elem = g_steal_pointer(&elem); } /* VQ is full or broken, just return and ignore kicks */ return; } + /* elem belongs to SVQ or external caller now */ + elem = NULL; } virtio_queue_set_notification(svq->vq, true); From 86f5f2546f03a3dfde421c715187b262e29b2848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:27 +0200 Subject: [PATCH 09/21] vhost: use SVQ element ndescs instead of opaque data for desc validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we're going to allow SVQ to add elements without the guest's knowledge and without its own VirtQueueElement, it's easier to check if an element is a valid head checking a different thing than the VirtQueueElement. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index a1261d4a0f..b35aeef4bd 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -414,7 +414,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, return NULL; } - if (unlikely(!svq->desc_state[used_elem.id].elem)) { + if (unlikely(!svq->desc_state[used_elem.id].ndescs)) { qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used, but it was not available", svq->vdev->name, used_elem.id); @@ -422,6 +422,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, } num = svq->desc_state[used_elem.id].ndescs; + svq->desc_state[used_elem.id].ndescs = 0; last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); svq->desc_next[last_used_chain] = svq->free_head; svq->free_head = used_elem.id; From 9e193cec5db949e4001070442a2f7de7042ef09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:28 +0200 Subject: [PATCH 10/21] vhost: Delete useless read memory barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed in previous series [1], this memory barrier is useless with the atomic read of used idx at vhost_svq_more_used. Deleting it. [1] https://lists.nongnu.org/archive/html/qemu-devel/2022-07/msg02616.html Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index b35aeef4bd..8df5296f24 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -509,9 +509,6 @@ size_t vhost_svq_poll(VhostShadowVirtqueue *svq) if (unlikely(g_get_monotonic_time() - start_us > 10e6)) { return 0; } - - /* Make sure we read new used_idx */ - smp_rmb(); } while (true); } From d368c0b052ad95d3bf4fcc5a5d25715a35c91d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:29 +0200 Subject: [PATCH 11/21] vhost: Do not depend on !NULL VirtQueueElement on vhost_svq_flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since QEMU will be able to inject new elements on CVQ to restore the state, we need not to depend on a VirtQueueElement to know if a new element has been used by the device or not. Instead of check that, check if there are new elements only using used idx on vhost_svq_flush. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 8df5296f24..e8e5bbc368 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -499,17 +499,20 @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq, size_t vhost_svq_poll(VhostShadowVirtqueue *svq) { int64_t start_us = g_get_monotonic_time(); + uint32_t len; + do { - uint32_t len; - VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); - if (elem) { - return len; + if (vhost_svq_more_used(svq)) { + break; } if (unlikely(g_get_monotonic_time() - start_us > 10e6)) { return 0; } } while (true); + + vhost_svq_get_buf(svq, &len); + return len; } /** From eb92b75380fc0f2368e22be45d1e2d1e2cd2f79c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:30 +0200 Subject: [PATCH 12/21] vhost_net: Add NetClientInfo start callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is used by the backend to perform actions before the device is started. In particular, vdpa net use it to map CVQ buffers to the device, so it can send control commands using them. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/net/vhost_net.c | 7 +++++++ include/net/net.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index ccac5b7a64..2e0baeba26 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -244,6 +244,13 @@ static int vhost_net_start_one(struct vhost_net *net, struct vhost_vring_file file = { }; int r; + if (net->nc->info->start) { + r = net->nc->info->start(net->nc); + if (r < 0) { + return r; + } + } + r = vhost_dev_enable_notifiers(&net->dev, dev); if (r < 0) { goto fail_notifiers; diff --git a/include/net/net.h b/include/net/net.h index 523136c7ac..ad9e80083a 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -44,6 +44,7 @@ typedef struct NICConf { typedef void (NetPoll)(NetClientState *, bool enable); typedef bool (NetCanReceive)(NetClientState *); +typedef int (NetStart)(NetClientState *); typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t); typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int); typedef void (NetCleanup) (NetClientState *); @@ -71,6 +72,7 @@ typedef struct NetClientInfo { NetReceive *receive_raw; NetReceiveIOV *receive_iov; NetCanReceive *can_receive; + NetStart *start; NetCleanup *cleanup; LinkStatusChanged *link_status_changed; QueryRxFilter *query_rx_filter; From c5e5269d8a955a0f924218911c2f4a0b34e87a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:31 +0200 Subject: [PATCH 13/21] vhost_net: Add NetClientInfo stop callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Used by the backend to perform actions after the device is stopped. In particular, vdpa net use it to unmap CVQ buffers to the device, cleaning the actions performed in prepare(). Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/net/vhost_net.c | 3 +++ include/net/net.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 2e0baeba26..9d4b334453 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -320,6 +320,9 @@ static void vhost_net_stop_one(struct vhost_net *net, net->nc->info->poll(net->nc, true); } vhost_dev_stop(&net->dev, dev); + if (net->nc->info->stop) { + net->nc->info->stop(net->nc); + } vhost_dev_disable_notifiers(&net->dev, dev); } diff --git a/include/net/net.h b/include/net/net.h index ad9e80083a..476ad45b9a 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -45,6 +45,7 @@ typedef struct NICConf { typedef void (NetPoll)(NetClientState *, bool enable); typedef bool (NetCanReceive)(NetClientState *); typedef int (NetStart)(NetClientState *); +typedef void (NetStop)(NetClientState *); typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t); typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int); typedef void (NetCleanup) (NetClientState *); @@ -73,6 +74,7 @@ typedef struct NetClientInfo { NetReceiveIOV *receive_iov; NetCanReceive *can_receive; NetStart *start; + NetStop *stop; NetCleanup *cleanup; LinkStatusChanged *link_status_changed; QueryRxFilter *query_rx_filter; From f8972b56eeace10a410990f032406250abe18d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:32 +0200 Subject: [PATCH 14/21] vdpa: add net_vhost_vdpa_cvq_info NetClientInfo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Next patches will add a new info callback to restore NIC status through CVQ. Since only the CVQ vhost device is needed, create it with a new NetClientInfo. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index a49e7e649d..1a597c2e92 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -334,6 +334,16 @@ static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s, return true; } +static NetClientInfo net_vhost_vdpa_cvq_info = { + .type = NET_CLIENT_DRIVER_VHOST_VDPA, + .size = sizeof(VhostVDPAState), + .receive = vhost_vdpa_receive, + .cleanup = vhost_vdpa_cleanup, + .has_vnet_hdr = vhost_vdpa_has_vnet_hdr, + .has_ufo = vhost_vdpa_has_ufo, + .check_peer_type = vhost_vdpa_check_peer_type, +}; + /** * Do not forward commands not supported by SVQ. Otherwise, the device could * accept it and qemu would not know how to update the device model. @@ -475,7 +485,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device, name); } else { - nc = qemu_new_net_control_client(&net_vhost_vdpa_info, peer, + nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer, device, name); } snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA); From 7a7f87e94c4e75ca177564491595dd17b7e41a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:33 +0200 Subject: [PATCH 15/21] vdpa: Move command buffers map to start of net device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As this series will reuse them to restore the device state at the end of a migration (or a device start), let's allocate only once at the device start so we don't duplicate their map and unmap. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 123 ++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 1a597c2e92..452d10ed93 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -263,29 +263,20 @@ static size_t vhost_vdpa_net_cvq_cmd_page_len(void) return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size()); } -/** Copy and map a guest buffer. */ -static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, - const struct iovec *out_data, - size_t out_num, size_t data_len, void *buf, - size_t *written, bool write) +/** Map CVQ buffer. */ +static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size, + bool write) { DMAMap map = {}; int r; - if (unlikely(!data_len)) { - qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid legnth of %s buffer\n", - __func__, write ? "in" : "out"); - return false; - } - - *written = iov_to_buf(out_data, out_num, 0, buf, data_len); map.translated_addr = (hwaddr)(uintptr_t)buf; - map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1; + map.size = size - 1; map.perm = write ? IOMMU_RW : IOMMU_RO, r = vhost_iova_tree_map_alloc(v->iova_tree, &map); if (unlikely(r != IOVA_OK)) { error_report("Cannot map injected element"); - return false; + return r; } r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf, @@ -294,50 +285,58 @@ static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, goto dma_map_err; } - return true; + return 0; dma_map_err: vhost_iova_tree_remove(v->iova_tree, map); - return false; + return r; } -/** - * Copy the guest element into a dedicated buffer suitable to be sent to NIC - * - * @iov: [0] is the out buffer, [1] is the in one - */ -static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s, - VirtQueueElement *elem, - struct iovec *iov) +static int vhost_vdpa_net_cvq_start(NetClientState *nc) { - size_t in_copied; - bool ok; + VhostVDPAState *s; + int r; - iov[0].iov_base = s->cvq_cmd_out_buffer; - ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num, - vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base, - &iov[0].iov_len, false); - if (unlikely(!ok)) { - return false; + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); + + s = DO_UPCAST(VhostVDPAState, nc, nc); + if (!s->vhost_vdpa.shadow_vqs_enabled) { + return 0; } - iov[1].iov_base = s->cvq_cmd_in_buffer; - ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0, - sizeof(virtio_net_ctrl_ack), iov[1].iov_base, - &in_copied, true); - if (unlikely(!ok)) { + r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer, + vhost_vdpa_net_cvq_cmd_page_len(), false); + if (unlikely(r < 0)) { + return r; + } + + r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_in_buffer, + vhost_vdpa_net_cvq_cmd_page_len(), true); + if (unlikely(r < 0)) { vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer); - return false; } - iov[1].iov_len = sizeof(virtio_net_ctrl_ack); - return true; + return r; +} + +static void vhost_vdpa_net_cvq_stop(NetClientState *nc) +{ + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); + + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); + + if (s->vhost_vdpa.shadow_vqs_enabled) { + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer); + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_in_buffer); + } } static NetClientInfo net_vhost_vdpa_cvq_info = { .type = NET_CLIENT_DRIVER_VHOST_VDPA, .size = sizeof(VhostVDPAState), .receive = vhost_vdpa_receive, + .start = vhost_vdpa_net_cvq_start, + .stop = vhost_vdpa_net_cvq_stop, .cleanup = vhost_vdpa_cleanup, .has_vnet_hdr = vhost_vdpa_has_vnet_hdr, .has_ufo = vhost_vdpa_has_ufo, @@ -348,19 +347,17 @@ static NetClientInfo net_vhost_vdpa_cvq_info = { * Do not forward commands not supported by SVQ. Otherwise, the device could * accept it and qemu would not know how to update the device model. */ -static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out, - size_t out_num) +static bool vhost_vdpa_net_cvq_validate_cmd(const void *out_buf, size_t len) { struct virtio_net_ctrl_hdr ctrl; - size_t n; - n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl)); - if (unlikely(n < sizeof(ctrl))) { + if (unlikely(len < sizeof(ctrl))) { qemu_log_mask(LOG_GUEST_ERROR, - "%s: invalid legnth of out buffer %zu\n", __func__, n); + "%s: invalid legnth of out buffer %zu\n", __func__, len); return false; } + memcpy(&ctrl, out_buf, sizeof(ctrl)); switch (ctrl.class) { case VIRTIO_NET_CTRL_MAC: switch (ctrl.cmd) { @@ -392,10 +389,14 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, VhostVDPAState *s = opaque; size_t in_len, dev_written; virtio_net_ctrl_ack status = VIRTIO_NET_ERR; - /* out and in buffers sent to the device */ - struct iovec dev_buffers[2] = { - { .iov_base = s->cvq_cmd_out_buffer }, - { .iov_base = s->cvq_cmd_in_buffer }, + /* Out buffer sent to both the vdpa device and the device model */ + struct iovec out = { + .iov_base = s->cvq_cmd_out_buffer, + }; + /* In buffer sent to the device */ + const struct iovec dev_in = { + .iov_base = s->cvq_cmd_in_buffer, + .iov_len = sizeof(virtio_net_ctrl_ack), }; /* in buffer used for device model */ const struct iovec in = { @@ -405,17 +406,15 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, int r = -EINVAL; bool ok; - ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers); + out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0, + s->cvq_cmd_out_buffer, + vhost_vdpa_net_cvq_cmd_len()); + ok = vhost_vdpa_net_cvq_validate_cmd(s->cvq_cmd_out_buffer, out.iov_len); if (unlikely(!ok)) { goto out; } - ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1); - if (unlikely(!ok)) { - goto out; - } - - r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem); + r = vhost_svq_add(svq, &out, 1, &dev_in, 1, elem); if (unlikely(r != 0)) { if (unlikely(r == -ENOSPC)) { qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", @@ -435,13 +434,13 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, goto out; } - memcpy(&status, dev_buffers[1].iov_base, sizeof(status)); + memcpy(&status, s->cvq_cmd_in_buffer, sizeof(status)); if (status != VIRTIO_NET_OK) { goto out; } status = VIRTIO_NET_ERR; - virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1); + virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1); if (status != VIRTIO_NET_OK) { error_report("Bad CVQ processing in model"); } @@ -454,12 +453,6 @@ out: } vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); g_free(elem); - if (dev_buffers[0].iov_base) { - vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base); - } - if (dev_buffers[1].iov_base) { - vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base); - } return r; } From be4278b65fc1be8fce87e1e7c01bc52602d304eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:34 +0200 Subject: [PATCH 16/21] vdpa: extract vhost_vdpa_net_cvq_add from vhost_vdpa_net_handle_ctrl_avail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we can reuse it to inject state messages. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang -- v7: * Remove double free error v6: * Do not assume in buffer sent to the device is sizeof(virtio_net_ctrl_ack) v5: * Do not use an artificial !NULL VirtQueueElement * Use only out size instead of iovec dev_buffers for these functions. Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 59 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 452d10ed93..3575bf64ee 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -331,6 +331,38 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc) } } +static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len, + size_t in_len) +{ + /* Buffers for the device */ + const struct iovec out = { + .iov_base = s->cvq_cmd_out_buffer, + .iov_len = out_len, + }; + const struct iovec in = { + .iov_base = s->cvq_cmd_in_buffer, + .iov_len = sizeof(virtio_net_ctrl_ack), + }; + VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0); + int r; + + r = vhost_svq_add(svq, &out, 1, &in, 1, NULL); + if (unlikely(r != 0)) { + if (unlikely(r == -ENOSPC)) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", + __func__); + } + return r; + } + + /* + * We can poll here since we've had BQL from the time we sent the + * descriptor. Also, we need to take the answer before SVQ pulls by itself, + * when BQL is released + */ + return vhost_svq_poll(svq); +} + static NetClientInfo net_vhost_vdpa_cvq_info = { .type = NET_CLIENT_DRIVER_VHOST_VDPA, .size = sizeof(VhostVDPAState), @@ -387,23 +419,18 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, void *opaque) { VhostVDPAState *s = opaque; - size_t in_len, dev_written; + size_t in_len; virtio_net_ctrl_ack status = VIRTIO_NET_ERR; /* Out buffer sent to both the vdpa device and the device model */ struct iovec out = { .iov_base = s->cvq_cmd_out_buffer, }; - /* In buffer sent to the device */ - const struct iovec dev_in = { - .iov_base = s->cvq_cmd_in_buffer, - .iov_len = sizeof(virtio_net_ctrl_ack), - }; /* in buffer used for device model */ const struct iovec in = { .iov_base = &status, .iov_len = sizeof(status), }; - int r = -EINVAL; + ssize_t dev_written = -EINVAL; bool ok; out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0, @@ -414,21 +441,11 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, goto out; } - r = vhost_svq_add(svq, &out, 1, &dev_in, 1, elem); - if (unlikely(r != 0)) { - if (unlikely(r == -ENOSPC)) { - qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", - __func__); - } + dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status)); + if (unlikely(dev_written < 0)) { goto out; } - /* - * We can poll here since we've had BQL from the time we sent the - * descriptor. Also, we need to take the answer before SVQ pulls by itself, - * when BQL is released - */ - dev_written = vhost_svq_poll(svq); if (unlikely(dev_written < sizeof(status))) { error_report("Insufficient written data (%zu)", dev_written); goto out; @@ -436,7 +453,7 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, memcpy(&status, s->cvq_cmd_in_buffer, sizeof(status)); if (status != VIRTIO_NET_OK) { - goto out; + return VIRTIO_NET_ERR; } status = VIRTIO_NET_ERR; @@ -453,7 +470,7 @@ out: } vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); g_free(elem); - return r; + return dev_written < 0 ? dev_written : 0; } static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = { From 539573c317dc0b8d50a128db60550f2f2898d2fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:35 +0200 Subject: [PATCH 17/21] vhost_net: add NetClientState->load() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It allows per-net client operations right after device's successful start. In particular, to load the device status. Vhost-vdpa net will use it to add the CVQ buffers to restore the device status. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/net/vhost_net.c | 7 +++++++ include/net/net.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 9d4b334453..d28f8b974b 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -281,6 +281,13 @@ static int vhost_net_start_one(struct vhost_net *net, } } } + + if (net->nc->info->load) { + r = net->nc->info->load(net->nc); + if (r < 0) { + goto fail; + } + } return 0; fail: file.fd = -1; diff --git a/include/net/net.h b/include/net/net.h index 476ad45b9a..81d0b21def 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -45,6 +45,7 @@ typedef struct NICConf { typedef void (NetPoll)(NetClientState *, bool enable); typedef bool (NetCanReceive)(NetClientState *); typedef int (NetStart)(NetClientState *); +typedef int (NetLoad)(NetClientState *); typedef void (NetStop)(NetClientState *); typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t); typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int); @@ -74,6 +75,7 @@ typedef struct NetClientInfo { NetReceiveIOV *receive_iov; NetCanReceive *can_receive; NetStart *start; + NetLoad *load; NetStop *stop; NetCleanup *cleanup; LinkStatusChanged *link_status_changed; From dd036d8d278e6882803bccaa8c51b8527ea33f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:36 +0200 Subject: [PATCH 18/21] vdpa: Add virtio-net mac address via CVQ at start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is needed so the destination vdpa device see the same state a the guest set in the source. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 3575bf64ee..640434d1ea 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -363,11 +363,51 @@ static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len, return vhost_svq_poll(svq); } +static int vhost_vdpa_net_load(NetClientState *nc) +{ + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); + const struct vhost_vdpa *v = &s->vhost_vdpa; + const VirtIONet *n; + uint64_t features; + + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); + + if (!v->shadow_vqs_enabled) { + return 0; + } + + n = VIRTIO_NET(v->dev->vdev); + features = n->parent_obj.guest_features; + if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) { + const struct virtio_net_ctrl_hdr ctrl = { + .class = VIRTIO_NET_CTRL_MAC, + .cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET, + }; + char *cursor = s->cvq_cmd_out_buffer; + ssize_t dev_written; + + memcpy(cursor, &ctrl, sizeof(ctrl)); + cursor += sizeof(ctrl); + memcpy(cursor, n->mac, sizeof(n->mac)); + + dev_written = vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + sizeof(n->mac), + sizeof(virtio_net_ctrl_ack)); + if (unlikely(dev_written < 0)) { + return dev_written; + } + + return *((virtio_net_ctrl_ack *)s->cvq_cmd_in_buffer) != VIRTIO_NET_OK; + } + + return 0; +} + static NetClientInfo net_vhost_vdpa_cvq_info = { .type = NET_CLIENT_DRIVER_VHOST_VDPA, .size = sizeof(VhostVDPAState), .receive = vhost_vdpa_receive, .start = vhost_vdpa_net_cvq_start, + .load = vhost_vdpa_net_load, .stop = vhost_vdpa_net_cvq_stop, .cleanup = vhost_vdpa_cleanup, .has_vnet_hdr = vhost_vdpa_has_vnet_hdr, From 0e3fdcffead7c651ce06ab50cffb89e806f04e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Tue, 23 Aug 2022 20:30:37 +0200 Subject: [PATCH 19/21] vdpa: Delete CVQ migration blocker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can restore the device state in the destination via CVQ now. Remove the migration blocker. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 15 --------------- include/hw/virtio/vhost-vdpa.h | 1 - net/vhost-vdpa.c | 2 -- 3 files changed, 18 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 23ae5ef48b..7468e44b87 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1033,13 +1033,6 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) return true; } - if (v->migration_blocker) { - int r = migrate_add_blocker(v->migration_blocker, &err); - if (unlikely(r < 0)) { - return false; - } - } - for (i = 0; i < v->shadow_vqs->len; ++i) { VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); @@ -1082,10 +1075,6 @@ err: vhost_svq_stop(svq); } - if (v->migration_blocker) { - migrate_del_blocker(v->migration_blocker); - } - return false; } @@ -1101,10 +1090,6 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); vhost_vdpa_svq_unmap_rings(dev, svq); } - - if (v->migration_blocker) { - migrate_del_blocker(v->migration_blocker); - } } static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index d10a89303e..1111d85643 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -35,7 +35,6 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; - Error *migration_blocker; GPtrArray *shadow_vqs; const VhostShadowVirtqueueOps *shadow_vq_ops; void *shadow_vq_ops_opaque; diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 640434d1ea..6ce68fcd3f 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -555,8 +555,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; s->vhost_vdpa.shadow_vq_ops_opaque = s; - error_setg(&s->vhost_vdpa.migration_blocker, - "Migration disabled: vhost-vdpa uses CVQ."); } ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs); if (ret) { From 3772cf0d1b37d32e61dc314e9cc18ff745327ddd Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Mon, 22 Aug 2022 16:14:36 +0800 Subject: [PATCH 20/21] net/colo.c: Fix the pointer issue reported by Coverity. When enabled the virtio-net-pci, guest network packet will load the vnet_hdr. In COLO status, the primary VM's network packet maybe redirect to another VM, it needs filter-redirect enable the vnet_hdr flag at the same time, COLO-proxy will correctly parse the original network packet. If have any misconfiguration here, the vnet_hdr_len is wrong for parse the packet, the data+offset will point to wrong place. Signed-off-by: Zhang Chen Signed-off-by: Jason Wang --- net/colo.c | 25 ++++++++++++++++--------- net/colo.h | 1 + net/trace-events | 2 +- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/net/colo.c b/net/colo.c index 6b0ff562ad..fb2c36a026 100644 --- a/net/colo.c +++ b/net/colo.c @@ -44,21 +44,28 @@ int parse_packet_early(Packet *pkt) { int network_length; static const uint8_t vlan[] = {0x81, 0x00}; - uint8_t *data = pkt->data + pkt->vnet_hdr_len; + uint8_t *data = pkt->data; uint16_t l3_proto; ssize_t l2hdr_len; - if (data == NULL) { - trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, " - "pkt->vnet_hdr_len", pkt->vnet_hdr_len); - return 1; - } - l2hdr_len = eth_get_l2_hdr_length(data); + assert(data); - if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) { - trace_colo_proxy_main("pkt->size < ETH_HLEN"); + /* Check the received vnet_hdr_len then add the offset */ + if ((pkt->vnet_hdr_len > sizeof(struct virtio_net_hdr_v1_hash)) || + (pkt->size < sizeof(struct eth_header) + sizeof(struct vlan_header) + + pkt->vnet_hdr_len)) { + /* + * The received remote packet maybe misconfiguration here, + * Please enable/disable filter module's the vnet_hdr flag at + * the same time. + */ + trace_colo_proxy_main_vnet_info("This received packet load wrong ", + pkt->vnet_hdr_len, pkt->size); return 1; } + data += pkt->vnet_hdr_len; + + l2hdr_len = eth_get_l2_hdr_length(data); /* * TODO: support vlan. diff --git a/net/colo.h b/net/colo.h index 8b3e8d5a83..22fc3031f7 100644 --- a/net/colo.h +++ b/net/colo.h @@ -18,6 +18,7 @@ #include "qemu/jhash.h" #include "qemu/timer.h" #include "net/eth.h" +#include "standard-headers/linux/virtio_net.h" #define HASHTABLE_MAX_SIZE 16384 diff --git a/net/trace-events b/net/trace-events index 6af927b4b9..823a071bdc 100644 --- a/net/trace-events +++ b/net/trace-events @@ -9,7 +9,7 @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d" # colo.c colo_proxy_main(const char *chr) ": %s" -colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d" +colo_proxy_main_vnet_info(const char *sta, uint32_t vnet_hdr, int size) ": %s pkt->vnet_hdr_len = %u, pkt->size = %d" # colo-compare.c colo_compare_main(const char *chr) ": %s" From 36a894aeb64a2e02871016da1c37d4a4ca109182 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sun, 21 Aug 2022 20:43:43 +0800 Subject: [PATCH 21/21] net: tulip: Restrict DMA engine to memories The DMA engine is started by I/O access and then itself accesses the I/O registers, triggering a reentrancy bug. The following log can reveal it: ==5637==ERROR: AddressSanitizer: stack-overflow #0 0x5595435f6078 in tulip_xmit_list_update qemu/hw/net/tulip.c:673 #1 0x5595435f204a in tulip_write qemu/hw/net/tulip.c:805:13 #2 0x559544637f86 in memory_region_write_accessor qemu/softmmu/memory.c:492:5 #3 0x5595446379fa in access_with_adjusted_size qemu/softmmu/memory.c:554:18 #4 0x5595446372fa in memory_region_dispatch_write qemu/softmmu/memory.c #5 0x55954468b74c in flatview_write_continue qemu/softmmu/physmem.c:2825:23 #6 0x559544683662 in flatview_write qemu/softmmu/physmem.c:2867:12 #7 0x5595446833f3 in address_space_write qemu/softmmu/physmem.c:2963:18 #8 0x5595435fb082 in dma_memory_rw_relaxed qemu/include/sysemu/dma.h:87:12 #9 0x5595435fb082 in dma_memory_rw qemu/include/sysemu/dma.h:130:12 #10 0x5595435fb082 in dma_memory_write qemu/include/sysemu/dma.h:171:12 #11 0x5595435fb082 in stl_le_dma qemu/include/sysemu/dma.h:272:1 #12 0x5595435fb082 in stl_le_pci_dma qemu/include/hw/pci/pci.h:910:1 #13 0x5595435fb082 in tulip_desc_write qemu/hw/net/tulip.c:101:9 #14 0x5595435f7e3d in tulip_xmit_list_update qemu/hw/net/tulip.c:706:9 #15 0x5595435f204a in tulip_write qemu/hw/net/tulip.c:805:13 Fix this bug by restricting the DMA engine to memories regions. Signed-off-by: Zheyu Ma Signed-off-by: Jason Wang --- hw/net/tulip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/net/tulip.c b/hw/net/tulip.c index 097e905bec..b9e42c322a 100644 --- a/hw/net/tulip.c +++ b/hw/net/tulip.c @@ -70,7 +70,7 @@ static const VMStateDescription vmstate_pci_tulip = { static void tulip_desc_read(TULIPState *s, hwaddr p, struct tulip_descriptor *desc) { - const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; + const MemTxAttrs attrs = { .memory = true }; if (s->csr[0] & CSR0_DBO) { ldl_be_pci_dma(&s->dev, p, &desc->status, attrs); @@ -88,7 +88,7 @@ static void tulip_desc_read(TULIPState *s, hwaddr p, static void tulip_desc_write(TULIPState *s, hwaddr p, struct tulip_descriptor *desc) { - const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; + const MemTxAttrs attrs = { .memory = true }; if (s->csr[0] & CSR0_DBO) { stl_be_pci_dma(&s->dev, p, desc->status, attrs);