From 009c2549bb9dc7f7061009eb87f2a53d4b364983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:26 +0200 Subject: [PATCH 01/25] vhost: move descriptor translation to vhost_svq_vring_write_descs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's done for both in and out descriptors so it's better placed here. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 56c96ebd13..e2184a4481 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -122,17 +122,35 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, return true; } -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, - const struct iovec *iovec, size_t num, - bool more_descs, bool write) +/** + * Write descriptors to SVQ vring + * + * @svq: The shadow virtqueue + * @sg: Cache for hwaddr + * @iovec: The iovec from the guest + * @num: iovec length + * @more_descs: True if more descriptors come in the chain + * @write: True if they are writeable descriptors + * + * Return true if success, false otherwise and print error. + */ +static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, + const struct iovec *iovec, size_t num, + bool more_descs, bool write) { uint16_t i = svq->free_head, last = svq->free_head; unsigned n; uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; vring_desc_t *descs = svq->vring.desc; + bool ok; if (num == 0) { - return; + return true; + } + + ok = vhost_svq_translate_addr(svq, sg, iovec, num); + if (unlikely(!ok)) { + return false; } for (n = 0; n < num; n++) { @@ -150,6 +168,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, } svq->free_head = le16_to_cpu(svq->desc_next[last]); + return true; } static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, @@ -169,20 +188,17 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, return false; } - ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); - if (unlikely(!ok)) { - return false; - } - vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, - elem->in_num > 0, false); - - - ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); + ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, + elem->in_num > 0, false); if (unlikely(!ok)) { return false; } - vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); + ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, + true); + if (unlikely(!ok)) { + return false; + } /* * Put the entry in the available array (but don't update avail->idx until From 6758c01f054c2a842d41d927d628b09f649d3254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:27 +0200 Subject: [PATCH 02/25] virtio-net: Expose MAC_TABLE_ENTRIES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vhost-vdpa control virtqueue needs to know the maximum entries supported by the virtio-net device, so we know if it is possible to apply the filter. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- hw/net/virtio-net.c | 1 - include/hw/virtio/virtio-net.h | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7ad948ee7c..f83e96e4ce 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -49,7 +49,6 @@ #define VIRTIO_NET_VM_VERSION 11 -#define MAC_TABLE_ENTRIES 64 #define MAX_VLAN (1 << 12) /* Per 802.1Q definition */ /* previously fixed value */ diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index eb87032627..cce1c554f7 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -35,6 +35,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET) * and latency. */ #define TX_BURST 256 +/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */ +#define MAC_TABLE_ENTRIES 64 + typedef struct virtio_net_conf { uint32_t txtimer; From 640b8a1c588b56349b3307d88459ea1cd86181fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:28 +0200 Subject: [PATCH 03/25] virtio-net: Expose ctrl virtqueue logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows external vhost-net devices to modify the state of the VirtIO device model once the vhost-vdpa device has acknowledged the control commands. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/net/virtio-net.c | 84 ++++++++++++++++++++-------------- include/hw/virtio/virtio-net.h | 4 ++ 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index f83e96e4ce..dd0d056fde 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1433,57 +1433,71 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, return VIRTIO_NET_OK; } -static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, + const struct iovec *in_sg, unsigned in_num, + const struct iovec *out_sg, + unsigned out_num) { VirtIONet *n = VIRTIO_NET(vdev); struct virtio_net_ctrl_hdr ctrl; virtio_net_ctrl_ack status = VIRTIO_NET_ERR; - VirtQueueElement *elem; size_t s; struct iovec *iov, *iov2; - unsigned int iov_cnt; + + if (iov_size(in_sg, in_num) < sizeof(status) || + iov_size(out_sg, out_num) < sizeof(ctrl)) { + virtio_error(vdev, "virtio-net ctrl missing headers"); + return 0; + } + + iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num); + s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl)); + iov_discard_front(&iov, &out_num, sizeof(ctrl)); + if (s != sizeof(ctrl)) { + status = VIRTIO_NET_ERR; + } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { + status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { + status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { + status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) { + status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { + status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { + status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num); + } + + s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status)); + assert(s == sizeof(status)); + + g_free(iov2); + return sizeof(status); +} + +static void 
virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtQueueElement *elem; for (;;) { + size_t written; elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); if (!elem) { break; } - if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) || - iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) { - virtio_error(vdev, "virtio-net ctrl missing headers"); + + written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num, + elem->out_sg, elem->out_num); + if (written > 0) { + virtqueue_push(vq, elem, written); + virtio_notify(vdev, vq); + g_free(elem); + } else { virtqueue_detach_element(vq, elem, 0); g_free(elem); break; } - - iov_cnt = elem->out_num; - iov2 = iov = g_memdup2(elem->out_sg, - sizeof(struct iovec) * elem->out_num); - s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl)); - iov_discard_front(&iov, &iov_cnt, sizeof(ctrl)); - if (s != sizeof(ctrl)) { - status = VIRTIO_NET_ERR; - } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { - status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { - status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { - status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) { - status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { - status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt); - } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { - status = virtio_net_handle_offloads(n, ctrl.cmd, iov, iov_cnt); - } - - s = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status)); - assert(s == sizeof(status)); - - virtqueue_push(vq, elem, sizeof(status)); - virtio_notify(vdev, vq); - g_free(iov2); - g_free(elem); } } diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index cce1c554f7..ef234ffe7e 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -221,6 +221,10 @@ struct VirtIONet { struct EBPFRSSContext ebpf_rss; }; +size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, + const struct iovec *in_sg, unsigned in_num, + const struct iovec *out_sg, + unsigned out_num); void virtio_net_set_netclient_name(VirtIONet *n, const char *name, const char *type); From c381abc37f0aba42ed2e3b41cdace8f8438829e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:29 +0200 Subject: [PATCH 04/25] vdpa: Avoid the compiler squashing reads to used idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the next patch we will allow busy polling of this value. The compiler sees an execution path where shadow_used_idx, last_used_idx, and the vring used idx are not modified within the same busy-polling thread, so it may cache the read in a register. This was not an issue before, since we always cleared the device event notifier before checking it, and that could act as a memory barrier. However, the busy poll needs something similar to the kernel's READ_ONCE. Let's add it here, separated from the polling.
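For reference, a minimal standalone sketch of the pattern this patch open-codes; READ_IDX_ONCE is a hypothetical helper for illustration only, not a QEMU API:

#include <stdbool.h>
#include <stdint.h>

/* Force a fresh load on every busy-poll iteration: without the volatile
 * cast, the compiler may legally keep the first read cached in a register. */
#define READ_IDX_ONCE(ptr) (*(volatile uint16_t *)(ptr))

static inline bool used_idx_moved(const uint16_t *used_idx, uint16_t last_seen)
{
    return READ_IDX_ONCE(used_idx) != last_seen;
}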
Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index e2184a4481..560d07ab36 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -327,11 +327,12 @@ static void vhost_handle_guest_kick_notifier(EventNotifier *n) static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) { + uint16_t *used_idx = &svq->vring.used->idx; if (svq->last_used_idx != svq->shadow_used_idx) { return true; } - svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); + svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx); return svq->last_used_idx != svq->shadow_used_idx; } From d93a2405ca6efa9dc1c420cee5a34bd8242818d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:30 +0200 Subject: [PATCH 05/25] vhost: Reorder vhost_svq_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Future code needs to call it from vhost_svq_add. No functional change intended. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 560d07ab36..043a185b96 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -215,6 +215,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, return true; } +static void vhost_svq_kick(VhostShadowVirtqueue *svq) +{ + /* + * We need to expose the available array entries before checking the used + * flags + */ + smp_mb(); + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { + return; + } + + event_notifier_set(&svq->hdev_kick); +} + /** * Add an element to a SVQ. * @@ -235,20 +249,6 @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) return true; } -static void vhost_svq_kick(VhostShadowVirtqueue *svq) -{ - /* - * We need to expose the available array entries before checking the used - * flags - */ - smp_mb(); - if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { - return; - } - - event_notifier_set(&svq->hdev_kick); -} - /** * Forward available buffers. * From 98b5adef8493a2bfad6655cfee84299e88bedbf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:31 +0200 Subject: [PATCH 06/25] vhost: Move vhost_svq_kick call to vhost_svq_add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The series needs to expose vhost_svq_add with full functionality, including kick Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 043a185b96..e272c3318a 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -246,6 +246,7 @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) } svq->ring_id_maps[qemu_head] = elem; + vhost_svq_kick(svq); return true; } @@ -306,7 +307,6 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) /* VQ is broken, just return and ignore any other kicks */ return; } - vhost_svq_kick(svq); } virtio_queue_set_notification(svq->vq, true); From f20b70eb5a68cfd8fef74a13ccdd494ef1cb0221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:32 +0200 Subject: [PATCH 07/25] vhost: Check for queue full at vhost_svq_add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The series needs to expose vhost_svq_add with full functionality, including checking for a full queue. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 57 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index e272c3318a..11302ea1f2 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -233,21 +233,29 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) * Add an element to a SVQ. * * The caller must check that there is enough slots for the new element. It - * takes ownership of the element: In case of failure, it is free and the SVQ - * is considered broken. + * takes ownership of the element: In case of failure other than ENOSPC, it + * is freed. + + * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full */ -static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) +static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) { unsigned qemu_head; - bool ok = vhost_svq_add_split(svq, elem, &qemu_head); + unsigned ndescs = elem->in_num + elem->out_num; + bool ok; + + if (unlikely(ndescs > vhost_svq_available_slots(svq))) { + return -ENOSPC; + } + + ok = vhost_svq_add_split(svq, elem, &qemu_head); if (unlikely(!ok)) { g_free(elem); - return false; + return -EINVAL; } svq->ring_id_maps[qemu_head] = elem; vhost_svq_kick(svq); - return true; + return 0; } /** @@ -274,7 +282,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) while (true) { VirtQueueElement *elem; - bool ok; + int r; if (svq->next_guest_avail_elem) { elem = g_steal_pointer(&svq->next_guest_avail_elem); @@ -286,25 +294,24 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) break; } - if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { - /* - * This condition is possible since a contiguous buffer in GPA - * does not imply a contiguous buffer in qemu's VA - * scatter-gather segments. If that happens, the buffer exposed - * to the device needs to be a chain of descriptors at this - * moment. - * - * SVQ cannot hold more available buffers if we are here: - * queue the current guest descriptor and ignore further kicks - * until some elements are used.
- */ - svq->next_guest_avail_elem = elem; - return; - } + r = vhost_svq_add(svq, elem); + if (unlikely(r != 0)) { + if (r == -ENOSPC) { + /* + * This condition is possible since a contiguous buffer in + * GPA does not imply a contiguous buffer in qemu's VA + * scatter-gather segments. If that happens, the buffer + * exposed to the device needs to be a chain of descriptors + * at this moment. + * + * SVQ cannot hold more available buffers if we are here: + * queue the current guest descriptor and ignore kicks + * until some elements are used. + */ + svq->next_guest_avail_elem = elem; + } - ok = vhost_svq_add(svq, elem); - if (unlikely(!ok)) { - /* VQ is broken, just return and ignore any other kicks */ + /* VQ is full or broken, just return and ignore kicks */ return; } } From 1f46ae65d85f677b660bda46685dd3e94885a7cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:33 +0200 Subject: [PATCH 08/25] vhost: Decouple vhost_svq_add from VirtQueueElement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VirtQueueElement comes from the guest, but we're heading SVQ to be able to modify the element presented to the device without the guest's knowledge. To do so, make SVQ accept sg buffers directly, instead of using VirtQueueElement. Add vhost_svq_add_element to maintain element convenience. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 33 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 11302ea1f2..e3afa2ba44 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -172,30 +172,31 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, } static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, - VirtQueueElement *elem, unsigned *head) + const struct iovec *out_sg, size_t out_num, + const struct iovec *in_sg, size_t in_num, + unsigned *head) { unsigned avail_idx; vring_avail_t *avail = svq->vring.avail; bool ok; - g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num)); *head = svq->free_head; /* We need some descriptors here */ - if (unlikely(!elem->out_num && !elem->in_num)) { + if (unlikely(!out_num && !in_num)) { qemu_log_mask(LOG_GUEST_ERROR, "Guest provided element with no descriptors"); return false; } - ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, - elem->in_num > 0, false); + ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0, + false); if (unlikely(!ok)) { return false; } - ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, - true); + ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true); if (unlikely(!ok)) { return false; } @@ -237,17 +238,19 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) * * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full */ -static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) +static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, + size_t out_num, const struct iovec *in_sg, + size_t in_num, VirtQueueElement *elem) { unsigned qemu_head; - unsigned ndescs = elem->in_num + elem->out_num; + unsigned ndescs = in_num + out_num; bool ok; if (unlikely(ndescs > 
vhost_svq_available_slots(svq))) { return -ENOSPC; } - ok = vhost_svq_add_split(svq, elem, &qemu_head); + ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head); if (unlikely(!ok)) { g_free(elem); return -EINVAL; } @@ -258,6 +261,14 @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, return 0; } +/* Convenience wrapper to add a guest's element to SVQ */ +static int vhost_svq_add_element(VhostShadowVirtqueue *svq, + VirtQueueElement *elem) +{ + return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg, + elem->in_num, elem); +} + /** * Forward available buffers. * @@ -294,7 +305,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) break; } - r = vhost_svq_add(svq, elem); + r = vhost_svq_add_element(svq, elem); if (unlikely(r != 0)) { if (r == -ENOSPC) { /* From 9e87868fcaf5785c8e1490c290505fa32305ff91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:34 +0200 Subject: [PATCH 09/25] vhost: Add SVQDescState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow SVQ to add context to the different queue elements. This patch only stores the actual element; no functional change intended. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++-------- hw/virtio/vhost-shadow-virtqueue.h | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index e3afa2ba44..e4c09e2d88 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -256,7 +256,7 @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, return -EINVAL; } - svq->ring_id_maps[qemu_head] = elem; + svq->desc_state[qemu_head].elem = elem; vhost_svq_kick(svq); return 0; } @@ -411,21 +411,21 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, return NULL; } - if (unlikely(!svq->ring_id_maps[used_elem.id])) { + if (unlikely(!svq->desc_state[used_elem.id].elem)) { qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used, but it was not available", svq->vdev->name, used_elem.id); return NULL; } - num = svq->ring_id_maps[used_elem.id]->in_num + - svq->ring_id_maps[used_elem.id]->out_num; + num = svq->desc_state[used_elem.id].elem->in_num + + svq->desc_state[used_elem.id].elem->out_num; last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); svq->desc_next[last_used_chain] = svq->free_head; svq->free_head = used_elem.id; *len = used_elem.len; - return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); + return g_steal_pointer(&svq->desc_state[used_elem.id].elem); } static void vhost_svq_flush(VhostShadowVirtqueue *svq, @@ -595,7 +595,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, memset(svq->vring.desc, 0, driver_size); svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size); memset(svq->vring.used, 0, device_size); - svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); + svq->desc_state = g_new0(SVQDescState, svq->vring.num); svq->desc_next = g_new0(uint16_t, svq->vring.num); for (unsigned i = 0; i < svq->vring.num - 1; i++) { svq->desc_next[i] = cpu_to_le16(i + 1); @@ -620,7 +620,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) for (unsigned i = 0; i < svq->vring.num; ++i) { g_autofree VirtQueueElement *elem = NULL; - elem =
g_steal_pointer(&svq->ring_id_maps[i]); + elem = g_steal_pointer(&svq->desc_state[i].elem); if (elem) { virtqueue_detach_element(svq->vq, elem, 0); } @@ -632,7 +632,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) } svq->vq = NULL; g_free(svq->desc_next); - g_free(svq->ring_id_maps); + g_free(svq->desc_state); qemu_vfree(svq->vring.desc); qemu_vfree(svq->vring.used); } diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index c132c994e9..d646c35054 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -15,6 +15,10 @@ #include "standard-headers/linux/vhost_types.h" #include "hw/virtio/vhost-iova-tree.h" +typedef struct SVQDescState { + VirtQueueElement *elem; +} SVQDescState; + /* Shadow virtqueue to relay notifications */ typedef struct VhostShadowVirtqueue { /* Shadow vring */ @@ -47,8 +51,8 @@ typedef struct VhostShadowVirtqueue { /* IOVA mapping */ VhostIOVATree *iova_tree; - /* Map for use the guest's descriptors */ - VirtQueueElement **ring_id_maps; + /* SVQ vring descriptors state */ + SVQDescState *desc_state; /* Next VirtQueue element that guest made available */ VirtQueueElement *next_guest_avail_elem; From ac4cfdc6f39c06732d27554523f9d5f8a53b4ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:35 +0200 Subject: [PATCH 10/25] vhost: Track number of descs in SVQDescState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A guest buffer that is contiguous in GPA may need multiple descriptors in qemu's VA, so SVQ should track its length separately. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 4 ++-- hw/virtio/vhost-shadow-virtqueue.h | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index e4c09e2d88..8314405e01 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -257,6 +257,7 @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, } svq->desc_state[qemu_head].elem = elem; + svq->desc_state[qemu_head].ndescs = ndescs; vhost_svq_kick(svq); return 0; } @@ -418,8 +419,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, return NULL; } - num = svq->desc_state[used_elem.id].elem->in_num + - svq->desc_state[used_elem.id].elem->out_num; + num = svq->desc_state[used_elem.id].ndescs; last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); svq->desc_next[last_used_chain] = svq->free_head; svq->free_head = used_elem.id; diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index d646c35054..5c7e7cbab6 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -17,6 +17,12 @@ typedef struct SVQDescState { VirtQueueElement *elem; + + /* + * Number of descriptors exposed to the device. May or may not match + * the guest's + */ + unsigned int ndescs; } SVQDescState; /* Shadow virtqueue to relay notifications */ From 432efd144e990b6e040862de25f8f0b6a6eeb03d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:36 +0200 Subject: [PATCH 11/25] vhost: add vhost_svq_push_elem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function allows external SVQ users to return the guest's available buffers.
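As a sketch, the intended caller pattern looks as follows once later patches in this series expose vhost_svq_add and vhost_svq_poll (error handling elided):

/* Forward a guest element to the device and return it once processed. */
int r = vhost_svq_add(svq, elem->out_sg, elem->out_num,
                      elem->in_sg, elem->in_num, elem);
if (r == 0) {
    size_t len = vhost_svq_poll(svq);    /* busy-wait for the device */
    vhost_svq_push_elem(svq, elem, len); /* return the buffer to the guest */
    g_free(elem);
}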
Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++++++++++ hw/virtio/vhost-shadow-virtqueue.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 8314405e01..1669b1fcd1 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -428,6 +428,22 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, return g_steal_pointer(&svq->desc_state[used_elem.id].elem); } +/** + * Push an element to SVQ, returning it to the guest. + */ +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, + const VirtQueueElement *elem, uint32_t len) +{ + virtqueue_push(svq->vq, elem, len); + if (svq->next_guest_avail_elem) { + /* + * Avail ring was full when vhost_svq_flush was called, so it's a + * good moment to make more descriptors available if possible. + */ + vhost_handle_guest_kick(svq); + } +} + static void vhost_svq_flush(VhostShadowVirtqueue *svq, bool check_for_avail_queue) { diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index 5c7e7cbab6..d9fc1f1799 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -84,6 +84,9 @@ typedef struct VhostShadowVirtqueue { bool vhost_svq_valid_features(uint64_t features, Error **errp); +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, + const VirtQueueElement *elem, uint32_t len); + void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, From d0291f3f284d3bc220cdb13b0d8ac8a44eb5fd4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:37 +0200 Subject: [PATCH 12/25] vhost: Expose vhost_svq_add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows external parts of SVQ to forward custom buffers to the device. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 6 +++--- hw/virtio/vhost-shadow-virtqueue.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 1669b1fcd1..c3a75ca89e 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -238,9 +238,9 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) * * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full */ -static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, - size_t out_num, const struct iovec *in_sg, - size_t in_num, VirtQueueElement *elem) +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, + size_t out_num, const struct iovec *in_sg, size_t in_num, + VirtQueueElement *elem) { unsigned qemu_head; unsigned ndescs = in_num + out_num; diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index d9fc1f1799..dd78f4bec2 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -86,6 +86,9 @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); void vhost_svq_push_elem(VhostShadowVirtqueue *svq, const VirtQueueElement *elem, uint32_t len); +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, + size_t out_num, const struct iovec *in_sg, size_t in_num, + VirtQueueElement *elem); void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); From 3f44d13dda83d390cc9563e56e7d337e4f6223f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:38 +0200 Subject: [PATCH 13/25] vhost: add vhost_svq_poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It allows the Shadow Control VirtQueue to wait for the device to use the available buffers. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 27 +++++++++++++++++++++++++++ hw/virtio/vhost-shadow-virtqueue.h | 1 + 2 files changed, 28 insertions(+) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index c3a75ca89e..cc2ee4780d 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -485,6 +485,33 @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq, } while (!vhost_svq_enable_notification(svq)); } +/** + * Poll the SVQ for one device used buffer. + * + * This function races with the main event loop SVQ polling, so extra + * synchronization is needed. + * + * Return the length written by the device. + */ +size_t vhost_svq_poll(VhostShadowVirtqueue *svq) +{ + int64_t start_us = g_get_monotonic_time(); + do { + uint32_t len; + VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); + if (elem) { + return len; + } + + if (unlikely(g_get_monotonic_time() - start_us > 10e6)) { + return 0; + } + + /* Make sure we read new used_idx */ + smp_rmb(); + } while (true); +} + /** * Forward used buffers.
* diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index dd78f4bec2..cf442f7dea 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -89,6 +89,7 @@ void vhost_svq_push_elem(VhostShadowVirtqueue *svq, int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, size_t out_num, const struct iovec *in_sg, size_t in_num, VirtQueueElement *elem); +size_t vhost_svq_poll(VhostShadowVirtqueue *svq); void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); From e966c0b781aebabd2c0f5eef91678f08ce1d068c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:39 +0200 Subject: [PATCH 14/25] vhost: Add svq avail_handler callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows external handlers to be aware of new buffers that the guest places in the virtqueue. When this callback is defined the ownership of the guest's virtqueue element is transferred to the callback. This means that if the user wants to forward the descriptor it needs to manually inject it. The callback is also free to process the command by itself and use the element with svq_push. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-shadow-virtqueue.c | 14 ++++++++++++-- hw/virtio/vhost-shadow-virtqueue.h | 31 +++++++++++++++++++++++++++++- hw/virtio/vhost-vdpa.c | 3 ++- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index cc2ee4780d..e4956728dd 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -306,7 +306,11 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) break; } - r = vhost_svq_add_element(svq, elem); + if (svq->ops) { + r = svq->ops->avail_handler(svq, elem, svq->ops_opaque); + } else { + r = vhost_svq_add_element(svq, elem); + } if (unlikely(r != 0)) { if (r == -ENOSPC) { /* @@ -685,12 +689,16 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) * shadow methods and file descriptors. * * @iova_tree: Tree to perform descriptors translations + * @ops: SVQ owner callbacks + * @ops_opaque: ops opaque pointer * * Returns the new virtqueue or NULL. * * In case of error, reason is reported through error_report. */ -VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, + const VhostShadowVirtqueueOps *ops, + void *ops_opaque) { g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); int r; @@ -712,6 +720,8 @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); svq->iova_tree = iova_tree; + svq->ops = ops; + svq->ops_opaque = ops_opaque; return g_steal_pointer(&svq); err_init_hdev_call: diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index cf442f7dea..d04c34a589 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -25,6 +25,27 @@ typedef struct SVQDescState { unsigned int ndescs; } SVQDescState; +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; + +/** + * Callback to handle an avail buffer. 
+ * + * @svq: Shadow virtqueue + * @elem: Element placed in the queue by the guest + * @vq_callback_opaque: Opaque + * + * Returns 0 if the vq is running as expected. + * + * Note that ownership of elem is transferred to the callback. + */ +typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq, + VirtQueueElement *elem, + void *vq_callback_opaque); + +typedef struct VhostShadowVirtqueueOps { + VirtQueueAvailCallback avail_handler; +} VhostShadowVirtqueueOps; + /* Shadow virtqueue to relay notifications */ typedef struct VhostShadowVirtqueue { /* Shadow vring */ @@ -69,6 +90,12 @@ typedef struct VhostShadowVirtqueue { */ uint16_t *desc_next; + /* Caller callbacks */ + const VhostShadowVirtqueueOps *ops; + + /* Caller callbacks opaque */ + void *ops_opaque; + /* Next head to expose to the device */ uint16_t shadow_avail_idx; @@ -102,7 +129,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, VirtQueue *vq); void vhost_svq_stop(VhostShadowVirtqueue *svq); -VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, + const VhostShadowVirtqueueOps *ops, + void *ops_opaque); void vhost_svq_free(gpointer vq); G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 66f054a12c..0b13e98471 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -418,8 +418,9 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); for (unsigned n = 0; n < hdev->nvqs; ++n) { - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); + g_autoptr(VhostShadowVirtqueue) svq; + svq = vhost_svq_new(v->iova_tree, NULL, NULL); if (unlikely(!svq)) { error_setg(errp, "Cannot create svq %u", n); return -1; From 463ba1e3b8cf080812895c5f26d95d8d7db2e692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:40 +0200 Subject: [PATCH 15/25] vdpa: Export vhost_vdpa_dma_map and unmap calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shadow CVQ will copy buffers on qemu VA, so we avoid TOCTOU attacks from the guest that could set a different state in qemu device model and vdpa device. To do so, it needs to be able to map these new buffers to the device. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 7 +++---- include/hw/virtio/vhost-vdpa.h | 4 ++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 0b13e98471..96997210be 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -71,8 +71,8 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, return false; } -static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, - void *vaddr, bool readonly) +int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, + void *vaddr, bool readonly) { struct vhost_msg_v2 msg = {}; int fd = v->device_fd; @@ -97,8 +97,7 @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, return ret; } -static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, - hwaddr size) +int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size) { struct vhost_msg_v2 msg = {}; int fd = v->device_fd; diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index a29dbb3f53..7214eb47dc 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -39,4 +39,8 @@ typedef struct vhost_vdpa { VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; } VhostVDPA; +int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, + void *vaddr, bool readonly); +int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size); + #endif From 94c643732dc110d04bbdf0eb43c41bce23b3593e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:41 +0200 Subject: [PATCH 16/25] vhost-net-vdpa: add stubs for when no virtio-net device is present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/vhost-vdpa.c will need functions that are declared in vhost-shadow-virtqueue.c, that needs functions of virtio-net.c. Copy the vhost-vdpa-stub.c code so only the constructor net_init_vhost_vdpa needs to be defined. Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang --- net/meson.build | 3 ++- net/vhost-vdpa-stub.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 net/vhost-vdpa-stub.c diff --git a/net/meson.build b/net/meson.build index 754e2d1d40..d1be76daf3 100644 --- a/net/meson.build +++ b/net/meson.build @@ -41,7 +41,8 @@ endif softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix)) softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c')) if have_vhost_net_vdpa - softmmu_ss.add(files('vhost-vdpa.c')) + softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-vdpa.c'), if_false: files('vhost-vdpa-stub.c')) + softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-vdpa-stub.c')) endif vmnet_files = files( diff --git a/net/vhost-vdpa-stub.c b/net/vhost-vdpa-stub.c new file mode 100644 index 0000000000..1732ed2443 --- /dev/null +++ b/net/vhost-vdpa-stub.c @@ -0,0 +1,21 @@ +/* + * vhost-vdpa-stub.c + * + * Copyright (c) 2022 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "clients.h" +#include "net/vhost-vdpa.h" +#include "qapi/error.h" + +int net_init_vhost_vdpa(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) +{ + error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*"); + return -1; +} From bd907ae4b00ebedad5e586af05ea3d6490318d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:42 +0200 Subject: [PATCH 17/25] vdpa: manual forward CVQ buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do a simple forwarding of CVQ buffers, the same work SVQ could do but through callbacks. No functional change intended. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 3 +- include/hw/virtio/vhost-vdpa.h | 3 ++ net/vhost-vdpa.c | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 96997210be..beaaa7049a 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -419,7 +419,8 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, for (unsigned n = 0; n < hdev->nvqs; ++n) { g_autoptr(VhostShadowVirtqueue) svq; - svq = vhost_svq_new(v->iova_tree, NULL, NULL); + svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops, + v->shadow_vq_ops_opaque); if (unlikely(!svq)) { error_setg(errp, "Cannot create svq %u", n); return -1; diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 7214eb47dc..1111d85643 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -15,6 +15,7 @@ #include #include "hw/virtio/vhost-iova-tree.h" +#include "hw/virtio/vhost-shadow-virtqueue.h" #include "hw/virtio/virtio.h" #include "standard-headers/linux/vhost_types.h" @@ -35,6 +36,8 @@ typedef struct vhost_vdpa { /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; GPtrArray *shadow_vqs; + const VhostShadowVirtqueueOps *shadow_vq_ops; + void *shadow_vq_ops_opaque; struct vhost_dev *dev; VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; } VhostVDPA; diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index df1e69ee72..2e3b6b10d8 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -11,11 +11,14 @@ #include "qemu/osdep.h" #include "clients.h" +#include "hw/virtio/virtio-net.h" #include "net/vhost_net.h" #include "net/vhost-vdpa.h" #include "hw/virtio/vhost-vdpa.h" #include "qemu/config-file.h" #include "qemu/error-report.h" +#include "qemu/log.h" +#include "qemu/memalign.h" #include "qemu/option.h" #include "qapi/error.h" #include @@ -187,6 +190,57 @@ static NetClientInfo net_vhost_vdpa_info = { .check_peer_type = vhost_vdpa_check_peer_type, }; +/** + * Forward buffer for the moment. 
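+ *
+ * In outline: concatenate the guest's out and in iovecs into a single array,
+ * forward them to the device with vhost_svq_add(), busy-wait for the answer
+ * with vhost_svq_poll(), and push the element back to the guest with
+ * vhost_svq_push_elem().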
+ */ +static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, + VirtQueueElement *elem, + void *opaque) +{ + unsigned int n = elem->out_num + elem->in_num; + g_autofree struct iovec *dev_buffers = g_new(struct iovec, n); + size_t in_len, dev_written; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + int r; + + memcpy(dev_buffers, elem->out_sg, elem->out_num * sizeof(*dev_buffers)); + memcpy(dev_buffers + elem->out_num, elem->in_sg, + elem->in_num * sizeof(*dev_buffers)); + + r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1], + elem->in_num, elem); + if (unlikely(r != 0)) { + if (unlikely(r == -ENOSPC)) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", + __func__); + } + goto out; + } + + /* + * We can poll here since we've had BQL from the time we sent the + * descriptor. Also, we need to take the answer before SVQ pulls by itself, + * when BQL is released + */ + dev_written = vhost_svq_poll(svq); + if (unlikely(dev_written < sizeof(status))) { + error_report("Insufficient written data (%zu)", dev_written); + } + +out: + in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, + sizeof(status)); + if (unlikely(in_len < sizeof(status))) { + error_report("Bad device CVQ written length"); + } + vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); + g_free(elem); + return r; +} + +static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = { + .avail_handler = vhost_vdpa_net_handle_ctrl_avail, +}; + static NetClientState *net_vhost_vdpa_init(NetClientState *peer, const char *device, const char *name, @@ -211,6 +265,10 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, s->vhost_vdpa.device_fd = vdpa_device_fd; s->vhost_vdpa.index = queue_pair_index; + if (!is_datapath) { + s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; + s->vhost_vdpa.shadow_vq_ops_opaque = s; + } ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs); if (ret) { qemu_del_net_client(nc); From 2df4dd31e194c94da7d28c02e92449f4a989fca9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:43 +0200 Subject: [PATCH 18/25] vdpa: Buffer CVQ support on shadow virtqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the control virtqueue support for vDPA shadow virtqueue. This is needed for advanced networking features like rx filtering. Virtio-net control VQ copies the descriptors to qemu's VA, so we avoid TOCTOU with the guest's or device's memory every time there is a device model change. Otherwise, the guest could change the memory content between the time qemu reads it and the time the device reads it. To demonstrate command handling, VIRTIO_NET_F_CTRL_MAC_ADDR is implemented. If the virtio-net driver changes the MAC, the virtio-net device model will be updated with the new one, and an rx filtering change event will be raised. More CVQ commands could be added here straightforwardly, but they have not been tested. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S.
Tsirkin Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 213 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 205 insertions(+), 8 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 2e3b6b10d8..502f6f9d3e 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -33,6 +33,9 @@ typedef struct VhostVDPAState { NetClientState nc; struct vhost_vdpa vhost_vdpa; VHostNetState *vhost_net; + + /* Control commands shadow buffers */ + void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer; bool started; } VhostVDPAState; @@ -131,6 +134,8 @@ static void vhost_vdpa_cleanup(NetClientState *nc) { VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); + qemu_vfree(s->cvq_cmd_out_buffer); + qemu_vfree(s->cvq_cmd_in_buffer); if (s->vhost_net) { vhost_net_cleanup(s->vhost_net); g_free(s->vhost_net); @@ -190,24 +195,191 @@ static NetClientInfo net_vhost_vdpa_info = { .check_peer_type = vhost_vdpa_check_peer_type, }; +static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr) +{ + VhostIOVATree *tree = v->iova_tree; + DMAMap needle = { + /* + * No need to specify size or to look for more translations since + * this contiguous chunk was allocated by us. + */ + .translated_addr = (hwaddr)(uintptr_t)addr, + }; + const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle); + int r; + + if (unlikely(!map)) { + error_report("Cannot locate expected map"); + return; + } + + r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1); + if (unlikely(r != 0)) { + error_report("Device cannot unmap: %s(%d)", g_strerror(r), r); + } + + vhost_iova_tree_remove(tree, map); +} + +static size_t vhost_vdpa_net_cvq_cmd_len(void) +{ + /* + * MAC_TABLE_SET is the ctrl command that produces the longest out buffer. + * The in buffer is always 1 byte, so it should fit here + */ + return sizeof(struct virtio_net_ctrl_hdr) + + 2 * sizeof(struct virtio_net_ctrl_mac) + + MAC_TABLE_ENTRIES * ETH_ALEN; +} + +static size_t vhost_vdpa_net_cvq_cmd_page_len(void) +{ + return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size()); +} + +/** Copy and map a guest buffer. */ +static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, + const struct iovec *out_data, + size_t out_num, size_t data_len, void *buf, + size_t *written, bool write) +{ + DMAMap map = {}; + int r; + + if (unlikely(!data_len)) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length of %s buffer\n", + __func__, write ? "in" : "out"); + return false; + } + + *written = iov_to_buf(out_data, out_num, 0, buf, data_len); + map.translated_addr = (hwaddr)(uintptr_t)buf; + map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1; + map.perm = write ? IOMMU_RW : IOMMU_RO; + r = vhost_iova_tree_map_alloc(v->iova_tree, &map); + if (unlikely(r != IOVA_OK)) { + error_report("Cannot map injected element"); + return false; + } + + r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf, + !write); + if (unlikely(r < 0)) { + goto dma_map_err; + } + + return true; + +dma_map_err: + vhost_iova_tree_remove(v->iova_tree, &map); + return false; +} + /** - * Forward buffer for the moment.
+ * Copy the guest element into a dedicated buffer suitable to be sent to the + * NIC + * + * @iov: [0] is the out buffer, [1] is the in one + */ +static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s, + VirtQueueElement *elem, + struct iovec *iov) +{ + size_t in_copied; + bool ok; + + iov[0].iov_base = s->cvq_cmd_out_buffer; + ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num, + vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base, + &iov[0].iov_len, false); + if (unlikely(!ok)) { + return false; + } + + iov[1].iov_base = s->cvq_cmd_in_buffer; + ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0, + sizeof(virtio_net_ctrl_ack), iov[1].iov_base, + &in_copied, true); + if (unlikely(!ok)) { + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer); + return false; + } + + iov[1].iov_len = sizeof(virtio_net_ctrl_ack); + return true; +} + +/** + * Do not forward commands not supported by SVQ. Otherwise, the device could + * accept them and qemu would not know how to update the device model. + */ +static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out, + size_t out_num) +{ + struct virtio_net_ctrl_hdr ctrl; + size_t n; + + n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl)); + if (unlikely(n < sizeof(ctrl))) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: invalid length of out buffer %zu\n", __func__, n); + return false; + } + + switch (ctrl.class) { + case VIRTIO_NET_CTRL_MAC: + switch (ctrl.cmd) { + case VIRTIO_NET_CTRL_MAC_ADDR_SET: + return true; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n", + __func__, ctrl.cmd); + } + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n", + __func__, ctrl.class); + } + + return false; +} + +/** + * Validate and copy control virtqueue commands. + * + * Following QEMU guidelines, we offer a copy of the buffers to the device to + * prevent TOCTOU bugs.
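+ *
+ * The command is copied into qemu-owned bounce buffers (cvq_cmd_out_buffer
+ * and cvq_cmd_in_buffer) that are mapped to the device only while the
+ * request is in flight, so the guest cannot change a command after it has
+ * been validated.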
*/ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, VirtQueueElement *elem, void *opaque) { - unsigned int n = elem->out_num + elem->in_num; - g_autofree struct iovec *dev_buffers = g_new(struct iovec, n); + VhostVDPAState *s = opaque; size_t in_len, dev_written; virtio_net_ctrl_ack status = VIRTIO_NET_ERR; - int r; + /* out and in buffers sent to the device */ + struct iovec dev_buffers[2] = { + { .iov_base = s->cvq_cmd_out_buffer }, + { .iov_base = s->cvq_cmd_in_buffer }, + }; + /* in buffer used for device model */ + const struct iovec in = { + .iov_base = &status, + .iov_len = sizeof(status), + }; + int r = -EINVAL; + bool ok; - memcpy(dev_buffers, elem->out_sg, elem->out_num); - memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num); + ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers); + if (unlikely(!ok)) { + goto out; + } - r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1], - elem->in_num, elem); + ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1); + if (unlikely(!ok)) { + goto out; + } + + r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem); if (unlikely(r != 0)) { if (unlikely(r == -ENOSPC)) { qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", @@ -224,6 +396,18 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, dev_written = vhost_svq_poll(svq); if (unlikely(dev_written < sizeof(status))) { error_report("Insufficient written data (%zu)", dev_written); + goto out; + } + + memcpy(&status, dev_buffers[1].iov_base, sizeof(status)); + if (status != VIRTIO_NET_OK) { + goto out; + } + + status = VIRTIO_NET_ERR; + virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1); + if (status != VIRTIO_NET_OK) { + error_report("Bad CVQ processing in model"); } out: @@ -234,6 +418,12 @@ out: } vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); g_free(elem); + if (dev_buffers[0].iov_base) { + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base); + } + if (dev_buffers[1].iov_base) { + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base); + } return r; } @@ -266,6 +456,13 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, s->vhost_vdpa.device_fd = vdpa_device_fd; s->vhost_vdpa.index = queue_pair_index; if (!is_datapath) { + s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(), + vhost_vdpa_net_cvq_cmd_page_len()); + memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len()); + s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(), + vhost_vdpa_net_cvq_cmd_page_len()); + memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len()); + s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; s->vhost_vdpa.shadow_vq_ops_opaque = s; } From 8170ab3f43989680491d00f1017f60b25d346114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:44 +0200 Subject: [PATCH 19/25] vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To know the device features is needed for CVQ SVQ, so SVQ knows if it can handle all commands or not. Extract from vhost_vdpa_get_max_queue_pairs so we can reuse it. Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 502f6f9d3e..6e3e9f312a 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -474,20 +474,24 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, return nc; } -static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) +static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp) +{ + int ret = ioctl(fd, VHOST_GET_FEATURES, features); + if (unlikely(ret < 0)) { + error_setg_errno(errp, errno, + "Fail to query features from vhost-vDPA device"); + } + return ret; +} + +static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features, + int *has_cvq, Error **errp) { unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); g_autofree struct vhost_vdpa_config *config = NULL; __virtio16 *max_queue_pairs; - uint64_t features; int ret; - ret = ioctl(fd, VHOST_GET_FEATURES, &features); - if (ret) { - error_setg(errp, "Fail to query features from vhost-vDPA device"); - return ret; - } - if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) { *has_cvq = 1; } else { @@ -517,10 +521,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, NetClientState *peer, Error **errp) { const NetdevVhostVDPAOptions *opts; + uint64_t features; int vdpa_device_fd; g_autofree NetClientState **ncs = NULL; NetClientState *nc; - int queue_pairs, i, has_cvq = 0; + int queue_pairs, r, i, has_cvq = 0; assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA); opts = &netdev->u.vhost_vdpa; @@ -534,7 +539,12 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, return -errno; } - queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, + r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp); + if (unlikely(r < 0)) { + return r; + } + + queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features, &has_cvq, errp); if (queue_pairs < 0) { qemu_close(vdpa_device_fd); From c156d5bf2b142dcc06808ccee06882144f230aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:45 +0200 Subject: [PATCH 20/25] vdpa: Add device migration blocker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the vhost-vdpa device is exposing _F_LOG, adding a migration blocker if it uses CVQ. However, qemu is able to migrate simple devices with no CVQ as long as they use SVQ. To allow it, add a placeholder error to vhost_vdpa, and only add to vhost_dev when used. vhost_dev machinery place the migration blocker if needed. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. 
From c156d5bf2b142dcc06808ccee06882144f230aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:45 +0200 Subject: [PATCH 20/25] vdpa: Add device migration blocker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the vhost-vdpa device is exposing _F_LOG, add a migration blocker if it uses CVQ. However, QEMU is able to migrate simple devices with no CVQ as long as they use SVQ. To allow this, add a placeholder error to vhost_vdpa, and only add it to vhost_dev when used. The vhost_dev machinery places the migration blocker if needed. Signed-off-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/virtio/vhost-vdpa.c | 15 +++++++++++++++ include/hw/virtio/vhost-vdpa.h | 1 + 2 files changed, 16 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index beaaa7049a..291cd19054 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -20,6 +20,7 @@ #include "hw/virtio/vhost-shadow-virtqueue.h" #include "hw/virtio/vhost-vdpa.h" #include "exec/address-spaces.h" +#include "migration/blocker.h" #include "qemu/cutils.h" #include "qemu/main-loop.h" #include "cpu.h" @@ -1022,6 +1023,13 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) return true; } + if (v->migration_blocker) { + int r = migrate_add_blocker(v->migration_blocker, &err); + if (unlikely(r < 0)) { + return false; + } + } + for (i = 0; i < v->shadow_vqs->len; ++i) { VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); @@ -1064,6 +1072,10 @@ err: vhost_svq_stop(svq); } + if (v->migration_blocker) { + migrate_del_blocker(v->migration_blocker); + } + return false; } @@ -1083,6 +1095,9 @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) } } + if (v->migration_blocker) { + migrate_del_blocker(v->migration_blocker); + } return true; } diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 1111d85643..d10a89303e 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -35,6 +35,7 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; + Error *migration_blocker; GPtrArray *shadow_vqs; const VhostShadowVirtqueueOps *shadow_vq_ops; void *shadow_vq_ops_opaque;
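A condensed sketch of the blocker pattern this patch introduces, using QEMU's real migration-blocker API from migration/blocker.h; the device type and start/stop functions are illustrative stand-ins for vhost_vdpa_svqs_start()/vhost_vdpa_svqs_stop(), and this only builds inside the QEMU tree:

#include "qemu/osdep.h"
#include "migration/blocker.h"
#include "qapi/error.h"

typedef struct MyDev {
    Error *migration_blocker;   /* placeholder, set only when CVQ is used */
} MyDev;

static bool my_dev_start(MyDev *d, Error **errp)
{
    /* Only devices that actually need it carry a non-NULL blocker */
    if (d->migration_blocker) {
        if (migrate_add_blocker(d->migration_blocker, errp) < 0) {
            return false;
        }
    }
    /* ... start shadow virtqueues; on failure, delete the blocker ... */
    return true;
}

static void my_dev_stop(MyDev *d)
{
    if (d->migration_blocker) {
        migrate_del_blocker(d->migration_blocker);
    }
}

The blocker Error itself is created lazily with error_setg(&d->migration_blocker, "..."); that is exactly what the next patch does for the CVQ net client.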
From 1576dbb5bbc49344c606e969ec749be70c0fd94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Wed, 20 Jul 2022 08:59:46 +0200 Subject: [PATCH 21/25] vdpa: Add x-svq to NetdevVhostVDPAOptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finally, offer the possibility to enable SVQ from the command line. Signed-off-by: Eugenio Pérez Acked-by: Markus Armbruster Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- net/vhost-vdpa.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++-- qapi/net.json | 9 +++++- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 6e3e9f312a..6abad276a6 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -75,6 +75,28 @@ const int vdpa_feature_bits[] = { VHOST_INVALID_FEATURE_BIT }; +/** Supported device specific feature bits with SVQ */ +static const uint64_t vdpa_svq_device_features = + BIT_ULL(VIRTIO_NET_F_CSUM) | + BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | + BIT_ULL(VIRTIO_NET_F_MTU) | + BIT_ULL(VIRTIO_NET_F_MAC) | + BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | + BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | + BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | + BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | + BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | + BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | + BIT_ULL(VIRTIO_NET_F_HOST_ECN) | + BIT_ULL(VIRTIO_NET_F_HOST_UFO) | + BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | + BIT_ULL(VIRTIO_NET_F_STATUS) | + BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | + BIT_ULL(VIRTIO_F_ANY_LAYOUT) | + BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | + BIT_ULL(VIRTIO_NET_F_RSC_EXT) | + BIT_ULL(VIRTIO_NET_F_STANDBY); + VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc) { VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); @@ -133,9 +155,13 @@ err_init: static void vhost_vdpa_cleanup(NetClientState *nc) { VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); + struct vhost_dev *dev = &s->vhost_net->dev; qemu_vfree(s->cvq_cmd_out_buffer); qemu_vfree(s->cvq_cmd_in_buffer); + if (dev->vq_index + dev->nvqs == dev->vq_index_end) { + g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete); + } if (s->vhost_net) { vhost_net_cleanup(s->vhost_net); g_free(s->vhost_net); @@ -437,7 +463,9 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, int vdpa_device_fd, int queue_pair_index, int nvqs, - bool is_datapath) + bool is_datapath, + bool svq, + VhostIOVATree *iova_tree) { NetClientState *nc = NULL; VhostVDPAState *s; @@ -455,6 +483,8 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, s->vhost_vdpa.device_fd = vdpa_device_fd; s->vhost_vdpa.index = queue_pair_index; + s->vhost_vdpa.shadow_vqs_enabled = svq; + s->vhost_vdpa.iova_tree = iova_tree; if (!is_datapath) { s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(), vhost_vdpa_net_cvq_cmd_page_len()); @@ -465,6 +495,8 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; s->vhost_vdpa.shadow_vq_ops_opaque = s; + error_setg(&s->vhost_vdpa.migration_blocker, + "Migration disabled: vhost-vdpa uses CVQ."); } ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs); if (ret) { @@ -474,6 +506,14 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, return nc; } +static int vhost_vdpa_get_iova_range(int fd, + struct vhost_vdpa_iova_range *iova_range) +{ + int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range); + + return ret < 0 ? -errno : 0; +}
+ static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp) { int ret = ioctl(fd, VHOST_GET_FEATURES, features); @@ -524,6 +564,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, uint64_t features; int vdpa_device_fd; g_autofree NetClientState **ncs = NULL; + g_autoptr(VhostIOVATree) iova_tree = NULL; NetClientState *nc; int queue_pairs, r, i, has_cvq = 0; @@ -551,22 +592,45 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, return queue_pairs; } + if (opts->x_svq) { + struct vhost_vdpa_iova_range iova_range; + + uint64_t invalid_dev_features = + features & ~vdpa_svq_device_features & + /* Transport are all accepted at this point */ + ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START, + VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START); + + if (invalid_dev_features) { + error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64, + invalid_dev_features); + goto err_svq; + } + + vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range); + iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last); + } + ncs = g_malloc0(sizeof(*ncs) * queue_pairs); for (i = 0; i < queue_pairs; i++) { ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, - vdpa_device_fd, i, 2, true); + vdpa_device_fd, i, 2, true, opts->x_svq, + iova_tree); if (!ncs[i]) goto err; } if (has_cvq) { nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, - vdpa_device_fd, i, 1, false); + vdpa_device_fd, i, 1, false, + opts->x_svq, iova_tree); if (!nc) goto err; } + /* iova_tree ownership belongs to last NetClientState */ + g_steal_pointer(&iova_tree); return 0; err: @@ -575,6 +639,8 @@ err: qemu_del_net_client(ncs[i]); } } + +err_svq: qemu_close(vdpa_device_fd); return -1; diff --git a/qapi/net.json b/qapi/net.json index 9af11e9a3b..75ba2cb989 100644 --- a/qapi/net.json +++ b/qapi/net.json @@ -445,12 +445,19 @@ # @queues: number of queues to be created for multiqueue vhost-vdpa # (default: 1) # +# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1) +# (default: false) +# +# Features: +# @unstable: Member @x-svq is experimental. +# # Since: 5.1 ## { 'struct': 'NetdevVhostVDPAOptions', 'data': { '*vhostdev': 'str', - '*queues': 'int' } } + '*queues': 'int', + '*x-svq': {'type': 'bool', 'features' : [ 'unstable'] } } } ## # @NetdevVmnetHostOptions:
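With the QAPI member in place, shadow virtqueues can be requested from the command line. An illustrative invocation (the vhost-vdpa device node path is an assumption of this example, not mandated by the patch):

    qemu-system-x86_64 \
        -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0,x-svq=on \
        -device virtio-net-pci,netdev=vdpa0

Because @x-svq carries the 'unstable' feature, management software is expected to treat it as experimental, matching the x- prefix convention.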
From 669846c530dc4ab4afa0d2ad827fec651cb7510c Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Fri, 1 Apr 2022 11:46:59 +0800 Subject: [PATCH 22/25] softmmu/runstate.c: add RunStateTransition support from COLO to PRELAUNCH If the checkpoint occurs when the guest has finished restarting but has not yet started running, runstate_set() may reject the transition from COLO to PRELAUNCH with the crash log: {"timestamp": {"seconds": 1593484591, "microseconds": 26605},\ "event": "RESET", "data": {"guest": true, "reason": "guest-reset"}} qemu-system-x86_64: invalid runstate transition: 'colo' -> 'prelaunch' Long-term testing says that it's pretty safe. Signed-off-by: Like Xu Signed-off-by: Zhang Chen Acked-by: Dr. David Alan Gilbert Signed-off-by: Jason Wang --- softmmu/runstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/softmmu/runstate.c b/softmmu/runstate.c index fac7b63259..168e1b78a0 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -126,6 +126,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, { RUN_STATE_COLO, RUN_STATE_RUNNING }, + { RUN_STATE_COLO, RUN_STATE_PRELAUNCH }, { RUN_STATE_COLO, RUN_STATE_SHUTDOWN}, { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, From a18d436954c534b74ed57fc126bb737247d22cba Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Fri, 1 Apr 2022 11:47:00 +0800 Subject: [PATCH 23/25] net/colo: Fix a "double free" crash to clear the conn_list We noticed that QEMU may crash when the guest has too many incoming network connections, with the following log: 15197@1593578622.668573:colo_proxy_main : colo proxy connection hashtable full, clear it free(): invalid pointer [1] 15195 abort (core dumped) qemu-system-x86_64 .... This is because we create the s->connection_track_table with g_hash_table_new_full(), which is defined as: GHashTable * g_hash_table_new_full (GHashFunc hash_func, GEqualFunc key_equal_func, GDestroyNotify key_destroy_func, GDestroyNotify value_destroy_func); The fourth parameter, connection_destroy(), is called to free the memory allocated for all 'Connection' values in the hashtable when we call g_hash_table_remove_all() in connection_hashtable_reset(). But both connection_track_table and conn_list reference the same conn instance, so clearing conn_list triggers a double free. This patch removes the free action on the hash table side to avoid double-freeing the conn. Signed-off-by: Like Xu Signed-off-by: Zhang Chen Signed-off-by: Jason Wang --- net/colo-compare.c | 2 +- net/filter-rewriter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/colo-compare.c b/net/colo-compare.c index d5d0965805..787c740f14 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -1323,7 +1323,7 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) s->connection_track_table = g_hash_table_new_full(connection_key_hash, connection_key_equal, g_free, - connection_destroy); + NULL); colo_compare_iothread(s); diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c index bf05023dc3..c18c4c2019 100644 --- a/net/filter-rewriter.c +++ b/net/filter-rewriter.c @@ -383,7 +383,7 @@ static void colo_rewriter_setup(NetFilterState *nf, Error **errp) s->connection_track_table = g_hash_table_new_full(connection_key_hash, connection_key_equal, g_free, - connection_destroy); + NULL); s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf); }
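The ownership rule behind this fix can be shown with a self-contained GLib program: when two containers share the same values, at most one of them may carry the destroy notify. This sketch mirrors the fixed arrangement with illustrative types (compile with `gcc demo.c $(pkg-config --cflags --libs glib-2.0)`):

#include <glib.h>

typedef struct { int id; } Connection;

static void connection_destroy(gpointer p) { g_free(p); }

int main(void)
{
    /* After the fix: the hash table gets NULL as value_destroy_func,
     * so conn_list is the single owner of every Connection. */
    GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal,
                                              NULL /* key */, NULL /* value */);
    GQueue *conn_list = g_queue_new();

    Connection *conn = g_new0(Connection, 1);
    g_hash_table_insert(table, GINT_TO_POINTER(1), conn);
    g_queue_push_tail(conn_list, conn);

    /* "Hashtable full": dropping all entries no longer frees the values... */
    g_hash_table_remove_all(table);
    /* ...so the single free per connection below is now safe. With
     * connection_destroy as the value destroy notify, this loop would
     * double-free and abort, as in the reported crash. */
    while (!g_queue_is_empty(conn_list)) {
        connection_destroy(g_queue_pop_head(conn_list));
    }
    g_queue_free(conn_list);
    g_hash_table_destroy(table);
    return 0;
}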
From 94c36c48751bf5ff644e6c8e17a21003edacfc5d Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Fri, 1 Apr 2022 11:47:01 +0800 Subject: [PATCH 24/25] net/colo.c: No need to track conn_list for filter-rewriter Filter-rewriter does not need to track connections in conn_list. This patch fixes the glib g_queue_is_empty assertion seen when a COLO guest keeps a lot of network connections. Signed-off-by: Zhang Chen Reviewed-by: Li Zhijian Signed-off-by: Jason Wang --- net/colo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/colo.c b/net/colo.c index 1f8162f59f..694f3c93ef 100644 --- a/net/colo.c +++ b/net/colo.c @@ -218,7 +218,7 @@ Connection *connection_get(GHashTable *connection_track_table, /* * clear the conn_list */ - while (!g_queue_is_empty(conn_list)) { + while (conn_list && !g_queue_is_empty(conn_list)) { connection_destroy(g_queue_pop_head(conn_list)); } } From 8bdab83b34efb0b598be4e5b98e4f466ca5f2f80 Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Fri, 1 Apr 2022 11:47:02 +0800 Subject: [PATCH 25/25] net/colo.c: fix segmentation fault when packet is not parsed correctly When COLO sets the vnet_hdr_support parameter on only one of filter-redirector and filter-mirror (or colo-compare), COLO will crash with a segmentation fault. Backtrace as follows: Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. 0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0) at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296 296 uint16_t proto = be16_to_cpu(PKT_GET_ETH_HDR(p)->h_proto); (gdb) bt 0 0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0) at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296 1 0x0000555555cb22b4 in parse_packet_early (pkt=0x555556a44840) at net/colo.c:49 2 0x0000555555cb2b91 in is_tcp_packet (pkt=0x555556a44840) at net/filter-rewriter.c:63 So a wrong vnet_hdr_len causes pkt->data to become NULL. Add a check to raise an error, and add a trace event to track vnet_hdr_len. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen Reviewed-by: Li Zhijian Signed-off-by: Jason Wang --- net/colo.c | 9 ++++++++- net/trace-events | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/net/colo.c b/net/colo.c index 694f3c93ef..6b0ff562ad 100644 --- a/net/colo.c +++ b/net/colo.c @@ -46,7 +46,14 @@ int parse_packet_early(Packet *pkt) static const uint8_t vlan[] = {0x81, 0x00}; uint8_t *data = pkt->data + pkt->vnet_hdr_len; uint16_t l3_proto; - ssize_t l2hdr_len = eth_get_l2_hdr_length(data); + ssize_t l2hdr_len; + + if (data == NULL) { + trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, " + "pkt->vnet_hdr_len", pkt->vnet_hdr_len); + return 1; + } + l2hdr_len = eth_get_l2_hdr_length(data); if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) { trace_colo_proxy_main("pkt->size < ETH_HLEN"); diff --git a/net/trace-events b/net/trace-events index d7a17256cc..6af927b4b9 100644 --- a/net/trace-events +++ b/net/trace-events @@ -9,6 +9,7 @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d" # colo.c colo_proxy_main(const char *chr) ": %s" +colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d" # colo-compare.c colo_compare_main(const char *chr) ": %s"
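The added guard is easy to reproduce in isolation. A standalone sketch of the same defensive check (the Packet struct here is a stripped-down stand-in for QEMU's, and plain stderr stands in for the trace event):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef struct Packet {
    uint8_t *data;        /* raw frame; NULL when capture/parse failed */
    size_t size;
    size_t vnet_hdr_len;  /* mismatched vnet_hdr settings make this bogus */
} Packet;

static int parse_packet_early(Packet *pkt)
{
    uint8_t *data = pkt->data ? pkt->data + pkt->vnet_hdr_len : NULL;

    if (data == NULL) {
        /* Mirrors the patch: report and skip instead of dereferencing */
        fprintf(stderr, "packet not parsed correctly, vnet_hdr_len=%zu\n",
                pkt->vnet_hdr_len);
        return 1;
    }
    /* ... continue with eth_get_l2_hdr_length(data) and friends ... */
    return 0;
}

int main(void)
{
    Packet bad = { .data = NULL, .size = 0, .vnet_hdr_len = 10 };
    return parse_packet_early(&bad); /* prints the diagnostic, exits 1 */
}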