From 923e2d98ede7404882656aeb4364c3964a95db3d Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 13 Nov 2015 15:24:09 +0800 Subject: [PATCH 01/15] vhost: let SET_VRING_ENABLE message depends on protocol feature But not depend on PROTOCOL_F_MQ feature bit. So that we could use SET_VRING_ENABLE to sign the backend on stop, even if MQ is disabled. That's reasonable, since we will have one queue pair at least. Signed-off-by: Yuanhan Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index c44360219f..3404b81eda 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -338,7 +338,7 @@ static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) .num = enable, }; - if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ))) { + if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) { return -1; } From 12b8cbac3c8243b3dd485aaebb82547aefa06adb Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 13 Nov 2015 15:24:10 +0800 Subject: [PATCH 02/15] vhost: don't send RESET_OWNER at stop First of all, RESET_OWNER message is sent incorrectly, as it's sent before GET_VRING_BASE. And the reset message would let the later call get nothing correct. And, sending SET_VRING_ENABLE at stop, which has already been done, makes more sense than RESET_OWNER. Signed-off-by: Yuanhan Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index d91b7b155e..14337a486c 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -292,12 +292,6 @@ static void vhost_net_stop_one(struct vhost_net *net, int r = vhost_ops->vhost_net_set_backend(&net->dev, &file); assert(r >= 0); } - } else if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER) { - for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_reset_device(&net->dev); - assert(r >= 0); - } } if (net->nc->info->poll) { net->nc->info->poll(net->nc, true); From a586e65bbd017ab55fe4149dd1bcba5c3a72bcd1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 15 Nov 2015 21:25:11 +0200 Subject: [PATCH 03/15] vhost-user: update spec description Clarify logging setup to make sure all clients comply in a way that is future-proof. Document how rings are started/stopped. Signed-off-by: Michael S. Tsirkin Reviewed-by: Victor Kaplansky --- docs/specs/vhost-user.txt | 64 +++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/docs/specs/vhost-user.txt b/docs/specs/vhost-user.txt index 26dde2ec42..df40cec54e 100644 --- a/docs/specs/vhost-user.txt +++ b/docs/specs/vhost-user.txt @@ -87,6 +87,14 @@ Depending on the request type, payload can be: User address: a 64-bit user address mmap offset: 64-bit offset where region starts in the mapped memory +* Log description + --------------------------- + | log size | log offset | + --------------------------- + log size: size of area used for logging + log offset: offset from start of supplied file descriptor + where logging starts (i.e. where guest address 0 would be logged) + In QEMU the vhost-user message is implemented with the following struct: typedef struct VhostUserMsg { @@ -138,6 +146,23 @@ As older slaves don't support negotiating protocol features, a feature bit was dedicated for this purpose: #define VHOST_USER_F_PROTOCOL_FEATURES 30 +Starting and stopping rings +---------------------- +Each ring is initialized in a stopped state, client must not process it until +ring is enabled. + +If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, client must start and +stop ring processing upon receiving VHOST_USER_SET_VRING_ENABLE with parameters +1 and 0 respoectively. + +If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, client must start +ring processing upon receiving a kick (that is, detecting that file descriptor +is readable) on the descriptor specified by VHOST_USER_SET_VRING_KICK, and stop +ring processing upon receiving VHOST_USER_GET_VRING_BASE. + +While rings are running, client must support changing some configuration +aspects on the fly. + Multiple queue support ---------------------- @@ -162,9 +187,13 @@ the slave makes to the memory mapped regions. The client should mark the dirty pages in a log. Once it complies to this logging, it may declare the VHOST_F_LOG_ALL vhost feature. +To start/stop logging of data/used ring writes, server may send messages +VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with +VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively. + All the modifications to memory pointed by vring "descriptor" should be marked. Modifications to "used" vring should be marked if -VHOST_VRING_F_LOG is part of ring's features. +VHOST_VRING_F_LOG is part of ring's flags. Dirty pages are of size: #define VHOST_LOG_PAGE 0x1000 @@ -173,22 +202,35 @@ The log memory fd is provided in the ancillary data of VHOST_USER_SET_LOG_BASE message when the slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature. -The size of the log may be computed by using all the known guest -addresses. The log covers from address 0 to the maximum of guest +The size of the log is supplied as part of VhostUserMsg +which should be large enough to cover all known guest +addresses. Log starts at the supplied offset in the +supplied file descriptor. +The log covers from address 0 to the maximum of guest regions. In pseudo-code, to mark page at "addr" as dirty: page = addr / VHOST_LOG_PAGE log[page / 8] |= 1 << page % 8 +Where addr is the guest physical address. + Use atomic operations, as the log may be concurrently manipulated. +Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG +is set for this ring), log_guest_addr should be used to calculate the log +offset: the write to first byte of the used ring is logged at this offset from +log start. Also note that this value might be outside the legal guest physical +address range (i.e. does not have to be covered by the VhostUserMemory table), +but the bit offset of the last byte of the ring must fall within +the size supplied by VhostUserLog. + VHOST_USER_SET_LOG_FD is an optional message with an eventfd in ancillary data, it may be used to inform the master that the log has been modified. -Once the source has finished migration, VHOST_USER_RESET_OWNER message -will be sent by the source. No further update must be done before the -destination takes over with new regions & rings. +Once the source has finished migration, rings will be stopped by +the source. No further update must be done before rings are +restarted. Protocol features ----------------- @@ -259,11 +301,13 @@ Message types * VHOST_USER_RESET_OWNER Id: 4 - Equivalent ioctl: VHOST_RESET_OWNER Master payload: N/A - Issued when a new connection is about to be closed. The Master will no - longer own this connection (and will usually close it). + This is no longer used. Used to be sent to request stopping + all rings, but some clients interpreted it to also discard + connection state (this interpretation would lead to bugs). + It is recommended that clients either ignore this message, + or use it to stop all rings. * VHOST_USER_SET_MEM_TABLE @@ -388,6 +432,8 @@ Message types Master payload: vring state description Signal slave to enable or disable corresponding vring. + This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES + has been negotiated. * VHOST_USER_SEND_RARP From 87656d50181e1be475303c1b88be6df0963c5bfd Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Nov 2015 13:33:36 +0200 Subject: [PATCH 04/15] vhost-user-test: support VHOST_USER_SET_VRING_ENABLE vhost-user-test is broken now: it assumes QEMU sends RESET_OWNER, and we stopped doing that. Wait for ENABLE_RING with 0 instead. Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-test.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index 01cfc7e25d..022223b2a7 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -70,6 +70,7 @@ typedef enum VhostUserRequest { VHOST_USER_SET_VRING_ERR = 14, VHOST_USER_GET_PROTOCOL_FEATURES = 15, VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_SET_VRING_ENABLE = 18, VHOST_USER_MAX } VhostUserRequest; @@ -315,8 +316,10 @@ static void chr_read(void *opaque, const uint8_t *buf, int size) g_cond_signal(&s->data_cond); break; - case VHOST_USER_RESET_OWNER: - s->fds_num = 0; + case VHOST_USER_SET_VRING_ENABLE: + if (!msg.payload.state.num) { + s->fds_num = 0; + } break; default: From 5421f318ecc82294ad089fd54924df787b67c971 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Nov 2015 13:55:53 +0200 Subject: [PATCH 05/15] vhost-user: print original request on error When we get an unexpected response, print out the original request. Helps debug protocol errors tremendously. Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 3404b81eda..5bc6c45dee 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -121,8 +121,8 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) r = qemu_chr_fe_read_all(chr, p, size); if (r != size) { - error_report("Failed to read msg header. Read %d instead of %d.", r, - size); + error_report("Failed to read msg header. Read %d instead of %d." + " Original request %d.", r, size, msg->request); goto fail; } From dc3db6adde329548771ab2addc2ef8376b2b8b32 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Nov 2015 18:40:18 +0200 Subject: [PATCH 06/15] vhost-user: start/stop all rings We are currently only sending VRING_ENABLE message for the first ring, that's wrong: we must start/stop them all. Reported-by: Victor Kaplansky Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 5bc6c45dee..71c3e16f67 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -333,18 +333,23 @@ static int vhost_user_set_vring_base(struct vhost_dev *dev, static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) { - struct vhost_vring_state state = { - .index = dev->vq_index, - .num = enable, - }; + int i; if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) { return -1; } - return vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state); -} + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_vring_state state = { + .index = dev->vq_index + i, + .num = enable, + }; + vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state); + } + + return 0; +} static int vhost_user_get_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) From 1f8431f42d833e8914f2d16ce4a49b7b72b90db0 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Fri, 13 Nov 2015 01:55:47 -0500 Subject: [PATCH 07/15] q35: Check propery to determine if iommu is set The helper function machine_iommu() isn't necesary. We can directly check for the property. Signed-off-by: Bandan Das Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: Bandan Das --- hw/core/machine.c | 5 ----- hw/pci-host/q35.c | 2 +- include/hw/boards.h | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index f4db340468..acca00db22 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -462,11 +462,6 @@ bool machine_usb(MachineState *machine) return machine->usb; } -bool machine_iommu(MachineState *machine) -{ - return machine->iommu; -} - bool machine_kernel_irqchip_allowed(MachineState *machine) { return machine->kernel_irqchip_allowed; diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index c81507d710..1fb470758b 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -506,7 +506,7 @@ static void mch_realize(PCIDevice *d, Error **errp) PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE); } /* Intel IOMMU (VT-d) */ - if (machine_iommu(current_machine)) { + if (object_property_get_bool(qdev_get_machine(), "iommu", NULL)) { mch_init_dmar(mch); } } diff --git a/include/hw/boards.h b/include/hw/boards.h index 3e9a92c055..24eb6f0e77 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -33,7 +33,6 @@ MachineClass *find_default_machine(void); extern MachineState *current_machine; bool machine_usb(MachineState *machine); -bool machine_iommu(MachineState *machine); bool machine_kernel_irqchip_allowed(MachineState *machine); bool machine_kernel_irqchip_required(MachineState *machine); int machine_kvm_shadow_mem(MachineState *machine); From 8d211f622b11ac2877c344f29de284d5a842d9d7 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Fri, 13 Nov 2015 01:55:48 -0500 Subject: [PATCH 08/15] i440fx: print an error message if user tries to enable iommu There's no indication of any sort that i440fx doesn't support "iommu=on" Reviewed-by: Eric Blake Signed-off-by: Bandan Das Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake Signed-off-by: Bandan Das --- hw/pci-host/piix.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/pci-host/piix.c b/hw/pci-host/piix.c index 7b2fbf9598..715208b22a 100644 --- a/hw/pci-host/piix.c +++ b/hw/pci-host/piix.c @@ -34,6 +34,7 @@ #include "sysemu/sysemu.h" #include "hw/i386/ioapic.h" #include "qapi/visitor.h" +#include "qemu/error-report.h" /* * I440FX chipset data sheet. @@ -301,6 +302,10 @@ static void i440fx_pcihost_realize(DeviceState *dev, Error **errp) static void i440fx_realize(PCIDevice *dev, Error **errp) { dev->config[I440FX_SMRAM] = 0x02; + + if (object_property_get_bool(qdev_get_machine(), "iommu", NULL)) { + error_report("warning: i440fx doesn't support emulated iommu"); + } } PCIBus *i440fx_init(const char *host_type, const char *pci_type, From 5c93c47338dbaa8a21a8ccc9d95dc5ade3f7fa19 Mon Sep 17 00:00:00 2001 From: Victor Kaplansky Date: Tue, 17 Nov 2015 12:04:06 +0200 Subject: [PATCH 09/15] tests/vhost-user-bridge: implement logging of dirty pages During migration devices continue writing to the guest's memory. The writes has to be reported to QEMU. This change implements minimal support in vhost-user-bridge required for successful migration of a guest with virtio-net device. Signed-off-by: Victor Kaplansky Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-bridge.c | 220 +++++++++++++++++++++++++++++++++----- 1 file changed, 195 insertions(+), 25 deletions(-) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 864f69e738..7bdfc98615 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -13,16 +13,22 @@ /* * TODO: * - main should get parameters from the command line. - * - implement all request handlers. + * - implement all request handlers. Still not implemented: + * vubr_get_queue_num_exec() + * vubr_send_rarp_exec() * - test for broken requests and virtqueue. * - implement features defined by Virtio 1.0 spec. * - support mergeable buffers and indirect descriptors. - * - implement RESET_DEVICE request. * - implement clean shutdown. * - implement non-blocking writes to UDP backend. * - implement polling strategy. + * - implement clean starting/stopping of vq processing + * - implement clean starting/stopping of used and buffers + * dirty page logging. */ +#define _FILE_OFFSET_BITS 64 + #include #include #include @@ -166,6 +172,8 @@ typedef struct VubrVirtq { struct vring_desc *desc; struct vring_avail *avail; struct vring_used *used; + uint64_t log_guest_addr; + int enable; } VubrVirtq; /* Based on qemu/hw/virtio/vhost-user.c */ @@ -173,6 +181,8 @@ typedef struct VubrVirtq { #define VHOST_MEMORY_MAX_NREGIONS 8 #define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_LOG_PAGE 4096 + enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_MQ = 0, VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, @@ -220,6 +230,11 @@ typedef struct VhostUserMemory { VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; } VhostUserMemory; +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + typedef struct VhostUserMsg { VhostUserRequest request; @@ -234,6 +249,7 @@ typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; + VhostUserLog log; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; int fd_num; @@ -265,8 +281,13 @@ typedef struct VubrDev { uint32_t nregions; VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; VubrVirtq vq[MAX_NR_VIRTQUEUE]; + int log_call_fd; + uint64_t log_size; + uint8_t *log_table; int backend_udp_sock; struct sockaddr_in backend_udp_dest; + int ready; + uint64_t features; } VubrDev; static const char *vubr_request_str[] = { @@ -368,7 +389,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg) rc = recvmsg(conn_fd, &msg, 0); - if (rc <= 0) { + if (rc == 0) { + vubr_die("recvmsg"); + fprintf(stderr, "Peer disconnected.\n"); + exit(1); + } + if (rc < 0) { vubr_die("recvmsg"); } @@ -395,7 +421,12 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg) if (vmsg->size) { rc = read(conn_fd, &vmsg->payload, vmsg->size); - if (rc <= 0) { + if (rc == 0) { + vubr_die("recvmsg"); + fprintf(stderr, "Peer disconnected.\n"); + exit(1); + } + if (rc < 0) { vubr_die("recvmsg"); } @@ -455,6 +486,16 @@ vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len) vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen); } +/* Kick the log_call_fd if required. */ +static void +vubr_log_kick(VubrDev *dev) +{ + if (dev->log_call_fd != -1) { + DPRINT("Kicking the QEMU's log...\n"); + eventfd_write(dev->log_call_fd, 1); + } +} + /* Kick the guest if necessary. */ static void vubr_virtqueue_kick(VubrVirtq *vq) @@ -465,12 +506,40 @@ vubr_virtqueue_kick(VubrVirtq *vq) } } +static void +vubr_log_page(uint8_t *log_table, uint64_t page) +{ + DPRINT("Logged dirty guest page: %"PRId64"\n", page); + atomic_or(&log_table[page / 8], 1 << (page % 8)); +} + +static void +vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length) +{ + uint64_t page; + + if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || + !dev->log_table || !length) { + return; + } + + assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); + + page = address / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < address + length) { + vubr_log_page(dev->log_table, page); + page += VHOST_LOG_PAGE; + } + vubr_log_kick(dev); +} + static void vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) { - struct vring_desc *desc = vq->desc; + struct vring_desc *desc = vq->desc; struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; + struct vring_used *used = vq->used; + uint64_t log_guest_addr = vq->log_guest_addr; unsigned int size = vq->size; @@ -510,6 +579,7 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) if (len <= chunk_len) { memcpy(chunk_start, buf, len); + vubr_log_write(dev, desc[i].addr, len); } else { fprintf(stderr, "Received too long packet from the backend. Dropping...\n"); @@ -519,11 +589,17 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) /* Add descriptor to the used ring. */ used->ring[u_index].id = d_index; used->ring[u_index].len = len; + vubr_log_write(dev, + log_guest_addr + offsetof(struct vring_used, ring[u_index]), + sizeof(used->ring[u_index])); vq->last_avail_index++; vq->last_used_index++; atomic_mb_set(&used->idx, vq->last_used_index); + vubr_log_write(dev, + log_guest_addr + offsetof(struct vring_used, idx), + sizeof(used->idx)); /* Kick the guest if necessary. */ vubr_virtqueue_kick(vq); @@ -532,9 +608,10 @@ vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) static int vubr_process_desc(VubrDev *dev, VubrVirtq *vq) { - struct vring_desc *desc = vq->desc; + struct vring_desc *desc = vq->desc; struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; + struct vring_used *used = vq->used; + uint64_t log_guest_addr = vq->log_guest_addr; unsigned int size = vq->size; @@ -552,6 +629,8 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq) void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); uint32_t chunk_len = desc[i].len; + assert(!(desc[i].flags & VRING_DESC_F_WRITE)); + if (len + chunk_len < buf_size) { memcpy(buf + len, chunk_start, chunk_len); DPRINT("%d ", chunk_len); @@ -577,6 +656,9 @@ vubr_process_desc(VubrDev *dev, VubrVirtq *vq) /* Add descriptor to the used ring. */ used->ring[u_index].id = d_index; used->ring[u_index].len = len; + vubr_log_write(dev, + log_guest_addr + offsetof(struct vring_used, ring[u_index]), + sizeof(used->ring[u_index])); vubr_consume_raw_packet(dev, buf, len); @@ -588,6 +670,7 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq) { struct vring_avail *avail = vq->avail; struct vring_used *used = vq->used; + uint64_t log_guest_addr = vq->log_guest_addr; while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { vubr_process_desc(dev, vq); @@ -596,6 +679,9 @@ vubr_process_avail(VubrDev *dev, VubrVirtq *vq) } atomic_mb_set(&used->idx, vq->last_used_index); + vubr_log_write(dev, + log_guest_addr + offsetof(struct vring_used, idx), + sizeof(used->idx)); } static void @@ -609,6 +695,10 @@ vubr_backend_recv_cb(int sock, void *ctx) int buflen = sizeof(buf); int len; + if (!dev->ready) { + return; + } + DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); @@ -656,14 +746,14 @@ vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) { vmsg->payload.u64 = ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_NET_F_CTRL_VQ) | - (1ULL << VIRTIO_NET_F_CTRL_RX) | - (1ULL << VHOST_F_LOG_ALL)); + (1ULL << VHOST_F_LOG_ALL) | + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)); + vmsg->size = sizeof(vmsg->payload.u64); DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - /* reply */ + /* Reply */ return 1; } @@ -671,6 +761,7 @@ static int vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) { DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + dev->features = vmsg->payload.u64; return 0; } @@ -680,10 +771,28 @@ vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) return 0; } +static void +vubr_close_log(VubrDev *dev) +{ + if (dev->log_table) { + if (munmap(dev->log_table, dev->log_size) != 0) { + vubr_die("munmap()"); + } + + dev->log_table = 0; + } + if (dev->log_call_fd != -1) { + close(dev->log_call_fd); + dev->log_call_fd = -1; + } +} + static int vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) { - DPRINT("Function %s() not implemented yet.\n", __func__); + vubr_close_log(dev); + dev->ready = 0; + dev->features = 0; return 0; } @@ -710,9 +819,9 @@ vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) DPRINT(" mmap_offset 0x%016"PRIx64"\n", msg_region->mmap_offset); - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; dev_region->mmap_offset = msg_region->mmap_offset; /* We don't use offset argument of mmap() since the @@ -736,14 +845,38 @@ vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) static int vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) { - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; + int fd; + uint64_t log_mmap_size, log_mmap_offset; + void *rc; + + assert(vmsg->fd_num == 1); + fd = vmsg->fds[0]; + + assert(vmsg->size == sizeof(vmsg->payload.log)); + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; + DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); + DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); + + rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, + log_mmap_offset); + if (rc == MAP_FAILED) { + vubr_die("mmap"); + } + dev->log_table = rc; + dev->log_size = log_mmap_size; + + vmsg->size = sizeof(vmsg->payload.u64); + /* Reply */ + return 1; } static int vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg) { - DPRINT("Function %s() not implemented yet.\n", __func__); + assert(vmsg->fd_num == 1); + dev->log_call_fd = vmsg->fds[0]; + DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); return 0; } @@ -777,6 +910,7 @@ vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr); vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr); vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr); + vq->log_guest_addr = vra->log_guest_addr; DPRINT("Setting virtq addresses:\n"); DPRINT(" vring_desc at %p\n", vq->desc); @@ -803,8 +937,18 @@ vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) static int vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) { - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; + unsigned int index = vmsg->payload.state.index; + + DPRINT("State.index: %d\n", index); + vmsg->payload.state.num = dev->vq[index].last_avail_index; + vmsg->size = sizeof(vmsg->payload.state); + /* FIXME: this is a work-around for a bug in QEMU enabling + * too early vrings. When protocol features are enabled, + * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */ + dev->ready = 0; + + /* Reply */ + return 1; } static int @@ -829,7 +973,17 @@ vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) DPRINT("Waiting for kicks on fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); } + /* We temporarily use this hack to determine that both TX and RX + * queues are set up and ready for processing. + * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and + * actual kicks. */ + if (dev->vq[0].kick_fd != -1 && + dev->vq[1].kick_fd != -1) { + dev->ready = 1; + DPRINT("vhost-user-bridge is ready for processing queues.\n"); + } return 0; + } static int @@ -858,9 +1012,12 @@ vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) static int vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) { - /* FIXME: unimplented */ + vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - return 0; + vmsg->size = sizeof(vmsg->payload.u64); + + /* Reply */ + return 1; } static int @@ -881,7 +1038,12 @@ vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg) static int vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg) { - DPRINT("Function %s() not implemented yet.\n", __func__); + unsigned int index = vmsg->payload.state.index; + unsigned int enable = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.enable: %d\n", enable); + dev->vq[index].enable = enable; return 0; } @@ -987,7 +1149,7 @@ vubr_accept_cb(int sock, void *ctx) socklen_t len = sizeof(un); conn_fd = accept(sock, (struct sockaddr *) &un, &len); - if (conn_fd == -1) { + if (conn_fd == -1) { vubr_die("accept()"); } DPRINT("Got connection from remote peer on sock %d\n", conn_fd); @@ -1009,9 +1171,17 @@ vubr_new(const char *path) .size = 0, .last_avail_index = 0, .last_used_index = 0, .desc = 0, .avail = 0, .used = 0, + .enable = 0, }; } + /* Init log */ + dev->log_call_fd = -1; + dev->log_size = 0; + dev->log_table = 0; + dev->ready = 0; + dev->features = 0; + /* Get a UNIX socket. */ dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); if (dev->sock == -1) { From 7ebcfe569211f6ff5402b558b85e2ce1e1066cf6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Nov 2015 13:55:48 +0200 Subject: [PATCH 10/15] specs/vhost-user: fix spec to match reality We wanted to start/stop rings on VRING_ENABLE, but that is not what QEMU does. Rather than tweaking code some more, with risk to stability, let's just document it as it is. We'll be able to fix this in the future with a new protocol feature bit. Reported-by: Victor Kaplansky Signed-off-by: Michael S. Tsirkin --- docs/specs/vhost-user.txt | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/specs/vhost-user.txt b/docs/specs/vhost-user.txt index df40cec54e..7b9cd6d0dd 100644 --- a/docs/specs/vhost-user.txt +++ b/docs/specs/vhost-user.txt @@ -148,20 +148,26 @@ a feature bit was dedicated for this purpose: Starting and stopping rings ---------------------- +Client must only process each ring when it is both started and enabled. + +If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized +in an enabled state. + +If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized +in a disabled state. Client must not process it until ring is enabled by +VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by +VHOST_USER_SET_VRING_ENABLE with parameter 0. + Each ring is initialized in a stopped state, client must not process it until -ring is enabled. +ring is started, or after it has been stopped. -If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, client must start and -stop ring processing upon receiving VHOST_USER_SET_VRING_ENABLE with parameters -1 and 0 respoectively. +Client must start ring upon receiving a kick (that is, detecting that file +descriptor is readable) on the descriptor specified by +VHOST_USER_SET_VRING_KICK, and stop ring upon receiving +VHOST_USER_GET_VRING_BASE. -If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, client must start -ring processing upon receiving a kick (that is, detecting that file descriptor -is readable) on the descriptor specified by VHOST_USER_SET_VRING_KICK, and stop -ring processing upon receiving VHOST_USER_GET_VRING_BASE. - -While rings are running, client must support changing some configuration -aspects on the fly. +While processing the rings (when they are started and enabled), client must +support changing some configuration aspects on the fly. Multiple queue support ---------------------- From 72018d1e1917a56d05e24aedc9f582b7c8385e19 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Nov 2015 16:55:17 +0200 Subject: [PATCH 11/15] vhost-user: ignore qemu-only features Some features (such as ctrl vq) are supported by qemu without need to communicate with the backend. Drop them from the feature mask so we set them unconditionally. Reported-by: Victor Kaplansky Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 14337a486c..318c3e6ad2 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -77,14 +77,8 @@ static const int user_feature_bits[] = { VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MRG_RXBUF, - VIRTIO_NET_F_STATUS, - VIRTIO_NET_F_CTRL_VQ, - VIRTIO_NET_F_CTRL_RX, - VIRTIO_NET_F_CTRL_VLAN, - VIRTIO_NET_F_CTRL_RX_EXTRA, - VIRTIO_NET_F_CTRL_MAC_ADDR, - VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, + /* This bit implies RARP isn't sent by QEMU out of band */ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, From 48854f57ce3e6aa4bd13368559e5c292e1c44e49 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Nov 2015 16:13:54 +0200 Subject: [PATCH 12/15] vhost-user: fix log size commit 2b8819c6eee517c1582983773f8555bb3f9ed645 ("vhost-user: modify SET_LOG_BASE to pass mmap size and offset") passes log size in units of 4 byte chunks instead of the expected size in bytes. Fix this up. Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 71c3e16f67..1b6c5ac238 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -206,7 +206,7 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, VhostUserMsg msg = { .request = VHOST_USER_SET_LOG_BASE, .flags = VHOST_USER_VERSION, - .payload.log.mmap_size = log->size, + .payload.log.mmap_size = log->size * sizeof(*(log->log)), .payload.log.mmap_offset = 0, .size = sizeof(msg.payload.log), }; From d9a3b33d2c9f996537b7f1d0246dee2d0120cefb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Nov 2015 15:14:07 +0200 Subject: [PATCH 13/15] acpi: fix buffer overrun on migration ich calls acpi_gpe_init with length ICH9_PMIO_GPE0_LEN so ICH9_PMIO_GPE0_LEN/2 bytes are allocated, but then the full ICH9_PMIO_GPE0_LEN bytes are migrated. As a quick work-around, allocate twice the memory. We'll probably want to tweak code to avoid migrating the extra ICH9_PMIO_GPE0_LEN/2 bytes, but that is a bit trickier to do without breaking migration compatibility. Tested-by: "Dr. David Alan Gilbert" Reported-by: "Dr. David Alan Gilbert" Cc: qemu-stable@nongnu.org Signed-off-by: Michael S. Tsirkin --- hw/acpi/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/acpi/core.c b/hw/acpi/core.c index fe6215af4a..21e113d713 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -625,8 +625,12 @@ void acpi_pm1_cnt_reset(ACPIREGS *ar) void acpi_gpe_init(ACPIREGS *ar, uint8_t len) { ar->gpe.len = len; - ar->gpe.sts = g_malloc0(len / 2); - ar->gpe.en = g_malloc0(len / 2); + /* Only first len / 2 bytes are ever used, + * but the caller in ich9.c migrates full len bytes. + * TODO: fix ich9.c and drop the extra allocation. + */ + ar->gpe.sts = g_malloc0(len); + ar->gpe.en = g_malloc0(len); } void acpi_gpe_reset(ACPIREGS *ar) From 421f4448cec3e42f8477499c5c584699e2cf656b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 26 Oct 2015 15:32:00 +0100 Subject: [PATCH 14/15] tests: re-enable vhost-user-test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7fe34ca9c2e actually disabled vhost-user-test altogether, since CONFIG_VHOST_NET is a per-target config variable. tests/vhost-user-test is already x86/x64 softmmu specific test, in order to enable it correctly, kvm & vhost-net are also conditions. To check that, set CONFIG_VHOST_NET_TEST_$target when kvm is also enabled. Since "check-qtest-x86_64-y = $(check-qtest-i386-y)", avoid duplication when both x86 & x64 are enabled. Other targets than x86 aren't enabled yet, and is intentionally left as a future improvement, since I can't easily test those. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- configure | 1 + tests/Makefile | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configure b/configure index f75df4b68f..0866bd870f 100755 --- a/configure +++ b/configure @@ -5663,6 +5663,7 @@ case "$target_name" in echo "CONFIG_KVM=y" >> $config_target_mak if test "$vhost_net" = "yes" ; then echo "CONFIG_VHOST_NET=y" >> $config_target_mak + echo "CONFIG_VHOST_NET_TEST_$target_name=y" >> $config_host_mak fi fi esac diff --git a/tests/Makefile b/tests/Makefile index 90c4141ac5..b9379841d8 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -197,8 +197,9 @@ gcov-files-i386-y += hw/usb/hcd-xhci.c check-qtest-i386-y += tests/pc-cpu-test$(EXESUF) check-qtest-i386-y += tests/q35-test$(EXESUF) gcov-files-i386-y += hw/pci-host/q35.c -ifeq ($(CONFIG_VHOST_NET),y) -check-qtest-i386-$(CONFIG_LINUX) += tests/vhost-user-test$(EXESUF) +check-qtest-i386-$(CONFIG_VHOST_NET_TEST_i386) += tests/vhost-user-test$(EXESUF) +ifeq ($(CONFIG_VHOST_NET_TEST_i386),) +check-qtest-x86_64-$(CONFIG_VHOST_NET_TEST_x86_64) += tests/vhost-user-test$(EXESUF) endif check-qtest-i386-y += tests/test-netfilter$(EXESUF) check-qtest-x86_64-y = $(check-qtest-i386-y) From 1c7ba94a184df1eddd589d5400d879568d3e5d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 18 Nov 2015 10:02:58 +0100 Subject: [PATCH 15/15] exec: silence hugetlbfs warning under qtest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vhost-user-test prints a warning. A test should not need to run on hugetlbfs, let's silence the warning under qtest. The condition can't check on qtest_enabled() since vhost-user-test actually doesn't use qtest accel. However, qtest_driver() can be used, if qtest_init() is called early enough. For that reason, move chardev and qtest initialization early. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 5 ++++- vl.c | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/exec.c b/exec.c index b09f18b2a4..acbd4a2cb5 100644 --- a/exec.c +++ b/exec.c @@ -51,6 +51,7 @@ #include "qemu/main-loop.h" #include "translate-all.h" #include "sysemu/replay.h" +#include "sysemu/qtest.h" #include "exec/memory-internal.h" #include "exec/ram_addr.h" @@ -1196,8 +1197,10 @@ static long gethugepagesize(const char *path, Error **errp) return 0; } - if (fs.f_type != HUGETLBFS_MAGIC) + if (!qtest_driver() && + fs.f_type != HUGETLBFS_MAGIC) { fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path); + } return fs.f_bsize; } diff --git a/vl.c b/vl.c index 7d993a5243..f9c661a4d0 100644 --- a/vl.c +++ b/vl.c @@ -4288,14 +4288,23 @@ int main(int argc, char **argv, char **envp) page_size_init(); socket_init(); - if (qemu_opts_foreach(qemu_find_opts("object"), - object_create, - object_create_initial, NULL)) { + if (qemu_opts_foreach(qemu_find_opts("chardev"), + chardev_init_func, NULL, NULL)) { exit(1); } - if (qemu_opts_foreach(qemu_find_opts("chardev"), - chardev_init_func, NULL, NULL)) { + if (qtest_chrdev) { + Error *local_err = NULL; + qtest_init(qtest_chrdev, qtest_log, &local_err); + if (local_err) { + error_report_err(local_err); + exit(1); + } + } + + if (qemu_opts_foreach(qemu_find_opts("object"), + object_create, + object_create_initial, NULL)) { exit(1); } @@ -4325,15 +4334,6 @@ int main(int argc, char **argv, char **envp) configure_accelerator(current_machine); - if (qtest_chrdev) { - Error *local_err = NULL; - qtest_init(qtest_chrdev, qtest_log, &local_err); - if (local_err) { - error_report_err(local_err); - exit(1); - } - } - machine_opts = qemu_get_machine_opts(); kernel_filename = qemu_opt_get(machine_opts, "kernel"); initrd_filename = qemu_opt_get(machine_opts, "initrd");