From 861dc73518a049887b709f031359713e5f6b284e Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 20 Jun 2019 13:44:32 -0400
Subject: [PATCH 01/22] pcie: don't skip multi-mask events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If we are trying to set multiple bits at once, testing that just one of
them is already set gives a false positive. As a result we won't
interrupt guest if e.g. presence detection change and attention button
press are both set. This happens with multi-function device removal.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
---
 hw/pci/pcie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 88c30ff74c..b22527000d 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -383,7 +383,7 @@ static void pcie_cap_slot_event(PCIDevice *dev, PCIExpressHotPlugEvent event)
 {
     /* Minor optimization: if nothing changed - no event is needed. */
     if (pci_word_test_and_set_mask(dev->config + dev->exp.exp_cap +
-                                   PCI_EXP_SLTSTA, event)) {
+                                   PCI_EXP_SLTSTA, event) == event) {
         return;
     }
     hotplug_event_notify(dev);

From 2841ab435bca9f102311e01bf157d5fa878935dc Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 21 Jun 2019 00:12:22 -0400
Subject: [PATCH 02/22] pcie: check that slt ctrl changed before deleting

During boot, linux would sometimes overwrites control of a powered off
slot before powering it on. Unfortunately QEMU interprets that as a
power off request and ejects the device.

For example:

/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -S -machine q35  \
    -device pcie-root-port,id=pcie_root_port_0,slot=2,chassis=2,addr=0x2,bus=pcie.0 \
    -monitor stdio disk.qcow2
(qemu)device_add virtio-balloon-pci,id=balloon,bus=pcie_root_port_0
(qemu)cont

Balloon is deleted during guest boot.

To fix, save control beforehand and check that power
or led state actually change before ejecting.

Note: this is more a hack than a solution, ideally we'd
find a better way to detect ejects, or move away
from ejects completely and instead monitor whether
it's safe to delete device due to e.g. its power state.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Tested-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/pci-bridge/pcie_root_port.c     |  5 ++++-
 hw/pci-bridge/xio3130_downstream.c |  5 ++++-
 hw/pci/pcie.c                      | 14 ++++++++++++--
 include/hw/pci/pcie.h              |  3 ++-
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c
index 92f253c924..09019ca05d 100644
--- a/hw/pci-bridge/pcie_root_port.c
+++ b/hw/pci-bridge/pcie_root_port.c
@@ -31,10 +31,13 @@ static void rp_write_config(PCIDevice *d, uint32_t address,
 {
     uint32_t root_cmd =
         pci_get_long(d->config + d->exp.aer_cap + PCI_ERR_ROOT_COMMAND);
+    uint16_t slt_ctl, slt_sta;
+
+    pcie_cap_slot_get(d, &slt_ctl, &slt_sta);
 
     pci_bridge_write_config(d, address, val, len);
     rp_aer_vector_update(d);
-    pcie_cap_slot_write_config(d, address, val, len);
+    pcie_cap_slot_write_config(d, slt_ctl, slt_sta, address, val, len);
     pcie_aer_write_config(d, address, val, len);
     pcie_aer_root_write_config(d, address, val, len, root_cmd);
 }
diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c
index 264e37d6a6..899b0fd6c9 100644
--- a/hw/pci-bridge/xio3130_downstream.c
+++ b/hw/pci-bridge/xio3130_downstream.c
@@ -41,9 +41,12 @@
 static void xio3130_downstream_write_config(PCIDevice *d, uint32_t address,
                                          uint32_t val, int len)
 {
+    uint16_t slt_ctl, slt_sta;
+
+    pcie_cap_slot_get(d, &slt_sta, &slt_ctl);
     pci_bridge_write_config(d, address, val, len);
     pcie_cap_flr_write_config(d, address, val, len);
-    pcie_cap_slot_write_config(d, address, val, len);
+    pcie_cap_slot_write_config(d, slt_ctl, slt_sta, address, val, len);
     pcie_aer_write_config(d, address, val, len);
 }
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index b22527000d..f8490a00de 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -594,7 +594,15 @@ void pcie_cap_slot_reset(PCIDevice *dev)
     hotplug_event_update_event_status(dev);
 }
 
-void pcie_cap_slot_write_config(PCIDevice *dev,
+void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slt_ctl, uint16_t *slt_sta)
+{
+    uint32_t pos = dev->exp.exp_cap;
+    uint8_t *exp_cap = dev->config + pos;
+    *slt_ctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL);
+    *slt_sta = pci_get_word(exp_cap + PCI_EXP_SLTSTA);
+}
+
+void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slt_ctl, uint16_t slt_sta,
                                 uint32_t addr, uint32_t val, int len)
 {
     uint32_t pos = dev->exp.exp_cap;
@@ -623,7 +631,9 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
      * controller is off, it is safe to detach the devices.
      */
     if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
-        ((val & PCI_EXP_SLTCTL_PIC_OFF) == PCI_EXP_SLTCTL_PIC_OFF)) {
+        (val & PCI_EXP_SLTCTL_PIC_OFF) == PCI_EXP_SLTCTL_PIC_OFF &&
+        (!(slt_ctl & PCI_EXP_SLTCTL_PCC) ||
+        (slt_ctl & PCI_EXP_SLTCTL_PIC_OFF) != PCI_EXP_SLTCTL_PIC_OFF)) {
         PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev));
         pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
                             pcie_unplug_device, NULL);
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index e30334d74d..8d90c0e193 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -107,7 +107,8 @@ void pcie_cap_lnkctl_reset(PCIDevice *dev);
 
 void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
 void pcie_cap_slot_reset(PCIDevice *dev);
-void pcie_cap_slot_write_config(PCIDevice *dev,
+void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slot_ctl, uint16_t *slt_sta);
+void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slot_ctl, uint16_t slt_sta,
                                 uint32_t addr, uint32_t val, int len);
 int pcie_cap_slot_post_load(void *opaque, int version_id);
 void pcie_cap_slot_push_attention_button(PCIDevice *dev);

From 110c477c2ede4d06d3a95a2bfda5c4c58d2af516 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 21 Jun 2019 02:38:41 -0400
Subject: [PATCH 03/22] pcie: work around for racy guest init

During boot, linux guests tend to clear all bits in pcie slot status
register which is used for hotplug.
If they clear bits that weren't set this is racy and will lose events:
not a big problem for manual hotplug on bare-metal, but a problem for us.

For example, the following is broken ATM:

/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -S -machine q35  \
    -device pcie-root-port,id=pcie_root_port_0,slot=2,chassis=2,addr=0x2,bus=pcie.0 \
    -device virtio-balloon-pci,id=balloon,bus=pcie_root_port_0 \
    -monitor stdio disk.qcow2
(qemu)device_del balloon
(qemu)cont

Balloon isn't deleted as it should.

As a work-around, detect this attempt to clear slot status and revert
status to what it was before the write.

Note: in theory this can be detected as a duplicate button press
which cancels the previous press. Does not seem to happen in
practice as guests seem to only have this bug during init.

Note2: the right thing to do is probably to fix Linux to
read status before clearing it, and act on the bits that are set.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Tested-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/pci/pcie.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index f8490a00de..c605d32dd4 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -610,6 +610,25 @@ void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slt_ctl, uint16_t slt_s
     uint16_t sltsta = pci_get_word(exp_cap + PCI_EXP_SLTSTA);
 
     if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
+        /*
+         * Guests tend to clears all bits during init.
+         * If they clear bits that weren't set this is racy and will lose events:
+         * not a big problem for manual button presses, but a problem for us.
+         * As a work-around, detect this and revert status to what it was
+         * before the write.
+         *
+         * Note: in theory this can be detected as a duplicate button press
+         * which cancels the previous press. Does not seem to happen in
+         * practice as guests seem to only have this bug during init.
+         */
+#define PCIE_SLOT_EVENTS (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | \
+                          PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_PDC | \
+                          PCI_EXP_SLTSTA_CC)
+
+        if (val & ~slt_sta & PCIE_SLOT_EVENTS) {
+            sltsta = (sltsta & ~PCIE_SLOT_EVENTS) | (slt_sta & PCIE_SLOT_EVENTS);
+            pci_set_word(exp_cap + PCI_EXP_SLTSTA, sltsta);
+        }
         hotplug_event_clear(dev);
     }
 

From d85d65cc297fa85b7b2db3b9025c6341d6986d7e Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 1 Jul 2019 05:29:51 -0400
Subject: [PATCH 04/22] pcie: minor cleanups for slot control/status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename function arguments to make intent clearer.
Better documentation for slot control logic.

Suggested-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/pci/pcie.c         | 17 +++++++++++------
 include/hw/pci/pcie.h |  3 ++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index c605d32dd4..a6beb567bd 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -602,7 +602,8 @@ void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slt_ctl, uint16_t *slt_sta)
     *slt_sta = pci_get_word(exp_cap + PCI_EXP_SLTSTA);
 }
 
-void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slt_ctl, uint16_t slt_sta,
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint16_t old_slt_ctl, uint16_t old_slt_sta,
                                 uint32_t addr, uint32_t val, int len)
 {
     uint32_t pos = dev->exp.exp_cap;
@@ -625,8 +626,8 @@ void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slt_ctl, uint16_t slt_s
                           PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_PDC | \
                           PCI_EXP_SLTSTA_CC)
 
-        if (val & ~slt_sta & PCIE_SLOT_EVENTS) {
-            sltsta = (sltsta & ~PCIE_SLOT_EVENTS) | (slt_sta & PCIE_SLOT_EVENTS);
+        if (val & ~old_slt_sta & PCIE_SLOT_EVENTS) {
+            sltsta = (sltsta & ~PCIE_SLOT_EVENTS) | (old_slt_sta & PCIE_SLOT_EVENTS);
             pci_set_word(exp_cap + PCI_EXP_SLTSTA, sltsta);
         }
         hotplug_event_clear(dev);
@@ -646,13 +647,17 @@ void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slt_ctl, uint16_t slt_s
     }
 
     /*
-     * If the slot is polulated, power indicator is off and power
+     * If the slot is populated, power indicator is off and power
      * controller is off, it is safe to detach the devices.
+     *
+     * Note: don't detach if condition was already true:
+     * this is a work around for guests that overwrite
+     * control of powered off slots before powering them on.
      */
     if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
         (val & PCI_EXP_SLTCTL_PIC_OFF) == PCI_EXP_SLTCTL_PIC_OFF &&
-        (!(slt_ctl & PCI_EXP_SLTCTL_PCC) ||
-        (slt_ctl & PCI_EXP_SLTCTL_PIC_OFF) != PCI_EXP_SLTCTL_PIC_OFF)) {
+        (!(old_slt_ctl & PCI_EXP_SLTCTL_PCC) ||
+        (old_slt_ctl & PCI_EXP_SLTCTL_PIC_OFF) != PCI_EXP_SLTCTL_PIC_OFF)) {
         PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev));
         pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
                             pcie_unplug_device, NULL);
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index 8d90c0e193..34f277735c 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -108,7 +108,8 @@ void pcie_cap_lnkctl_reset(PCIDevice *dev);
 void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot);
 void pcie_cap_slot_reset(PCIDevice *dev);
 void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slot_ctl, uint16_t *slt_sta);
-void pcie_cap_slot_write_config(PCIDevice *dev, uint16_t slot_ctl, uint16_t slt_sta,
+void pcie_cap_slot_write_config(PCIDevice *dev,
+                                uint16_t old_slot_ctl, uint16_t old_slt_sta,
                                 uint32_t addr, uint32_t val, int len);
 int pcie_cap_slot_post_load(void *opaque, int version_id);
 void pcie_cap_slot_push_attention_button(PCIDevice *dev);

From 5f503cd9f38856285a83afe5b928c94f40e43fb7 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Wed, 19 Jun 2019 15:19:01 +0530
Subject: [PATCH 05/22] virtio-pmem: add virtio device

This is the implementation of virtio-pmem device. Support will require
machine changes for the architectures that will support it, so it will
not yet be compiled. It can be unlocked with VIRTIO_PMEM_SUPPORTED per
machine and disabled globally via VIRTIO_PMEM.

We cannot use the "addr" property as that is already used e.g. for
virtio-pci/pci devices. And we will have e.g. virtio-pmem-pci as a proxy.
So we have to choose a different one (unfortunately). "memaddr" it is.
That name should ideally be used by all other virtio-* based memory
devices in the future.
    -device virtio-pmem-pci,id=p0,bus=bux0,addr=0x01,memaddr=0x1000000...

Acked-by: Markus Armbruster <armbru@redhat.com>
[ QAPI bits ]
Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
[ MemoryDevice/MemoryRegion changes, cleanups, addr property "memaddr",
  split up patches, unplug handler ]
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20190619094907.10131-2-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 hw/virtio/Kconfig               |  10 ++
 hw/virtio/Makefile.objs         |   1 +
 hw/virtio/virtio-pmem.c         | 189 ++++++++++++++++++++++++++++++++
 include/hw/virtio/virtio-pmem.h |  49 +++++++++
 qapi/misc.json                  |  28 ++++-
 5 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/virtio-pmem.c
 create mode 100644 include/hw/virtio/virtio-pmem.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index e0452de4ba..3724ff8bac 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -29,3 +29,13 @@ config VIRTIO_CRYPTO
     bool
     default y
     depends on VIRTIO
+
+config VIRTIO_PMEM_SUPPORTED
+    bool
+
+config VIRTIO_PMEM
+    bool
+    default y
+    depends on VIRTIO
+    depends on VIRTIO_PMEM_SUPPORTED
+    select MEM_DEVICE
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 5570ea8df8..5857e3b8e1 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -12,6 +12,7 @@ common-obj-$(CONFIG_VIRTIO_MMIO) += virtio-mmio.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon.o
 obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-pci.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
new file mode 100644
index 0000000000..adbfb603ab
--- /dev/null
+++ b/hw/virtio/virtio-pmem.c
@@ -0,0 +1,189 @@
+/*
+ * Virtio PMEM device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ *  Pankaj Gupta <pagupta@redhat.com>
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-pmem.h"
+#include "hw/virtio/virtio-access.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "standard-headers/linux/virtio_pmem.h"
+#include "block/aio.h"
+#include "block/thread-pool.h"
+
+typedef struct VirtIODeviceRequest {
+    VirtQueueElement elem;
+    int fd;
+    VirtIOPMEM *pmem;
+    VirtIODevice *vdev;
+    struct virtio_pmem_req req;
+    struct virtio_pmem_resp resp;
+} VirtIODeviceRequest;
+
+static int worker_cb(void *opaque)
+{
+    VirtIODeviceRequest *req_data = opaque;
+    int err = 0;
+
+    /* flush raw backing image */
+    err = fsync(req_data->fd);
+    if (err != 0) {
+        err = 1;
+    }
+
+    virtio_stw_p(req_data->vdev, &req_data->resp.ret, err);
+
+    return 0;
+}
+
+static void done_cb(void *opaque, int ret)
+{
+    VirtIODeviceRequest *req_data = opaque;
+    int len = iov_from_buf(req_data->elem.in_sg, req_data->elem.in_num, 0,
+                              &req_data->resp, sizeof(struct virtio_pmem_resp));
+
+    /* Callbacks are serialized, so no need to use atomic ops. */
+    virtqueue_push(req_data->pmem->rq_vq, &req_data->elem, len);
+    virtio_notify((VirtIODevice *)req_data->pmem, req_data->pmem->rq_vq);
+    g_free(req_data);
+}
+
+static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIODeviceRequest *req_data;
+    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+    HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
+    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+
+    req_data = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
+    if (!req_data) {
+        virtio_error(vdev, "virtio-pmem missing request data");
+        return;
+    }
+
+    if (req_data->elem.out_num < 1 || req_data->elem.in_num < 1) {
+        virtio_error(vdev, "virtio-pmem request not proper");
+        g_free(req_data);
+        return;
+    }
+    req_data->fd   = memory_region_get_fd(&backend->mr);
+    req_data->pmem = pmem;
+    req_data->vdev = vdev;
+    thread_pool_submit_aio(pool, worker_cb, req_data, done_cb, req_data);
+}
+
+static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+    struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *) config;
+
+    virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
+    virtio_stq_p(vdev, &pmemcfg->size, memory_region_size(&pmem->memdev->mr));
+}
+
+static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t features,
+                                        Error **errp)
+{
+    return features;
+}
+
+static void virtio_pmem_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOPMEM *pmem = VIRTIO_PMEM(dev);
+
+    if (!pmem->memdev) {
+        error_setg(errp, "virtio-pmem memdev not set");
+        return;
+    }
+
+    if (host_memory_backend_is_mapped(pmem->memdev)) {
+        char *path = object_get_canonical_path_component(OBJECT(pmem->memdev));
+        error_setg(errp, "can't use already busy memdev: %s", path);
+        g_free(path);
+        return;
+    }
+
+    host_memory_backend_set_mapped(pmem->memdev, true);
+    virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
+                sizeof(struct virtio_pmem_config));
+    pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
+}
+
+static void virtio_pmem_unrealize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOPMEM *pmem = VIRTIO_PMEM(dev);
+
+    host_memory_backend_set_mapped(pmem->memdev, false);
+    virtio_cleanup(vdev);
+}
+
+static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem,
+                                         VirtioPMEMDeviceInfo *vi)
+{
+    vi->memaddr = pmem->start;
+    vi->size = pmem->memdev ? memory_region_size(&pmem->memdev->mr) : 0;
+    vi->memdev = object_get_canonical_path(OBJECT(pmem->memdev));
+}
+
+static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem,
+                                                   Error **errp)
+{
+    if (!pmem->memdev) {
+        error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP);
+        return NULL;
+    }
+
+    return &pmem->memdev->mr;
+}
+
+static Property virtio_pmem_properties[] = {
+    DEFINE_PROP_UINT64(VIRTIO_PMEM_ADDR_PROP, VirtIOPMEM, start, 0),
+    DEFINE_PROP_LINK(VIRTIO_PMEM_MEMDEV_PROP, VirtIOPMEM, memdev,
+                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_pmem_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+    VirtIOPMEMClass *vpc = VIRTIO_PMEM_CLASS(klass);
+
+    dc->props = virtio_pmem_properties;
+
+    vdc->realize = virtio_pmem_realize;
+    vdc->unrealize = virtio_pmem_unrealize;
+    vdc->get_config = virtio_pmem_get_config;
+    vdc->get_features = virtio_pmem_get_features;
+
+    vpc->fill_device_info = virtio_pmem_fill_device_info;
+    vpc->get_memory_region = virtio_pmem_get_memory_region;
+}
+
+static TypeInfo virtio_pmem_info = {
+    .name          = TYPE_VIRTIO_PMEM,
+    .parent        = TYPE_VIRTIO_DEVICE,
+    .class_size    = sizeof(VirtIOPMEMClass),
+    .class_init    = virtio_pmem_class_init,
+    .instance_size = sizeof(VirtIOPMEM),
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&virtio_pmem_info);
+}
+
+type_init(virtio_register_types)
diff --git a/include/hw/virtio/virtio-pmem.h b/include/hw/virtio/virtio-pmem.h
new file mode 100644
index 0000000000..19b6ee6d75
--- /dev/null
+++ b/include/hw/virtio/virtio-pmem.h
@@ -0,0 +1,49 @@
+/*
+ * Virtio PMEM device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ *  Pankaj Gupta <pagupta@redhat.com>
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_VIRTIO_PMEM_H
+#define HW_VIRTIO_PMEM_H
+
+#include "hw/virtio/virtio.h"
+#include "sysemu/hostmem.h"
+
+#define TYPE_VIRTIO_PMEM "virtio-pmem"
+
+#define VIRTIO_PMEM(obj) \
+        OBJECT_CHECK(VirtIOPMEM, (obj), TYPE_VIRTIO_PMEM)
+#define VIRTIO_PMEM_CLASS(oc) \
+        OBJECT_CLASS_CHECK(VirtIOPMEMClass, (oc), TYPE_VIRTIO_PMEM)
+#define VIRTIO_PMEM_GET_CLASS(obj) \
+        OBJECT_GET_CLASS(VirtIOPMEMClass, (obj), TYPE_VIRTIO_PMEM)
+
+#define VIRTIO_PMEM_ADDR_PROP "memaddr"
+#define VIRTIO_PMEM_MEMDEV_PROP "memdev"
+
+typedef struct VirtIOPMEM {
+    VirtIODevice parent_obj;
+
+    VirtQueue *rq_vq;
+    uint64_t start;
+    HostMemoryBackend *memdev;
+} VirtIOPMEM;
+
+typedef struct VirtIOPMEMClass {
+    /* private */
+    VirtIODevice parent;
+
+    /* public */
+    void (*fill_device_info)(const VirtIOPMEM *pmem, VirtioPMEMDeviceInfo *vi);
+    MemoryRegion *(*get_memory_region)(VirtIOPMEM *pmem, Error **errp);
+} VirtIOPMEMClass;
+
+#endif
diff --git a/qapi/misc.json b/qapi/misc.json
index dc4cf9da20..6f1bff10e4 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -2738,16 +2738,42 @@
           }
 }
 
+##
+# @VirtioPMEMDeviceInfo:
+#
+# VirtioPMEM state information
+#
+# @id: device's ID
+#
+# @memaddr: physical address in memory, where device is mapped
+#
+# @size: size of memory that the device provides
+#
+# @memdev: memory backend linked with device
+#
+# Since: 4.1
+##
+{ 'struct': 'VirtioPMEMDeviceInfo',
+  'data': { '*id': 'str',
+            'memaddr': 'size',
+            'size': 'size',
+            'memdev': 'str'
+          }
+}
+
 ##
 # @MemoryDeviceInfo:
 #
 # Union containing information about a memory device
 #
+# nvdimm is included since 2.12. virtio-pmem is included since 4.1.
+#
 # Since: 2.1
 ##
 { 'union': 'MemoryDeviceInfo',
   'data': { 'dimm': 'PCDIMMDeviceInfo',
-            'nvdimm': 'PCDIMMDeviceInfo'
+            'nvdimm': 'PCDIMMDeviceInfo',
+            'virtio-pmem': 'VirtioPMEMDeviceInfo'
           }
 }
 

From 1e33b513f261a24b4340eca5a6f8f5726b92e097 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 19 Jun 2019 15:19:02 +0530
Subject: [PATCH 06/22] virtio-pci: Allow to specify additional interfaces for
 the base type

Let's allow to specify additional interfaces for the base type (e.g.
later TYPE_MEMORY_DEVICE), something that was possible before the
rework of virtio PCI device instantiation.

Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20190619094907.10131-3-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-pci.c | 1 +
 hw/virtio/virtio-pci.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index e6d5467e54..62e78e98f5 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1947,6 +1947,7 @@ void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t)
         .class_init    = virtio_pci_base_class_init,
         .class_data    = (void *)t,
         .abstract      = true,
+        .interfaces    = t->interfaces,
     };
     TypeInfo generic_type_info = {
         .name = t->generic_name,
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index bfea2892a5..619d9098c1 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -252,6 +252,7 @@ typedef struct VirtioPCIDeviceTypeInfo {
     size_t class_size;
     void (*instance_init)(Object *obj);
     void (*class_init)(ObjectClass *klass, void *data);
+    InterfaceInfo *interfaces;
 } VirtioPCIDeviceTypeInfo;
 
 /* Register virtio-pci type(s).  @t must be static. */

From 9f583bdd47189416c7ba331f1baf3a890f2671a6 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Wed, 19 Jun 2019 15:19:03 +0530
Subject: [PATCH 07/22] virtio-pmem: sync linux headers

Add linux headers for virtio pmem. These are not yet upstream - include
them temporarily as merge window in which this is supposed to be is
coming up shortly. If virtio-pmem ends up not being merged
then this will be reverted and accordingly virtio-pmem dropped.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Message-Id: <20190619094907.10131-4-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/standard-headers/linux/virtio_ids.h  |  1 +
 include/standard-headers/linux/virtio_pmem.h | 34 ++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 include/standard-headers/linux/virtio_pmem.h

diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h
index 6d5c3b2d4f..32b2f94d1f 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -43,5 +43,6 @@
 #define VIRTIO_ID_INPUT        18 /* virtio input */
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
+#define VIRTIO_ID_PMEM         27 /* virtio pmem */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/standard-headers/linux/virtio_pmem.h b/include/standard-headers/linux/virtio_pmem.h
new file mode 100644
index 0000000000..7e3d43b121
--- /dev/null
+++ b/include/standard-headers/linux/virtio_pmem.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Definitions for virtio-pmem devices.
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * Author(s): Pankaj Gupta <pagupta@redhat.com>
+ */
+
+#ifndef _UAPI_LINUX_VIRTIO_PMEM_H
+#define _UAPI_LINUX_VIRTIO_PMEM_H
+
+#include "standard-headers/linux/types.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "standard-headers/linux/virtio_config.h"
+
+struct virtio_pmem_config {
+	uint64_t start;
+	uint64_t size;
+};
+
+#define VIRTIO_PMEM_REQ_TYPE_FLUSH      0
+
+struct virtio_pmem_resp {
+	/* Host return status corresponding to flush request */
+	uint32_t ret;
+};
+
+struct virtio_pmem_req {
+	/* command type */
+	uint32_t type;
+};
+
+#endif

From adf0748a494151fd7c10050820f2a3988768828a Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Wed, 19 Jun 2019 15:19:04 +0530
Subject: [PATCH 08/22] virtio-pci: Proxy for virtio-pmem

We need a proxy device for virtio-pmem, and this device has to be the
actual memory device so we can cleanly hotplug it.

Forward memory device class functions either to the actual device or use
properties of the virtio-pmem device to implement these in the proxy.

virtio-pmem will only be compiled for selected, supported architectures
(that can deal with virtio/pci devices being memory devices). An
architecture that is prepared for that can simply enable
CONFIG_VIRTIO_PMEM to make it work.

As not all architectures support memory devices (and CONFIG_VIRTIO_PMEM
will be enabled per supported architecture), we have to move the PCI proxy
to a separate file.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
[ split up patches, memory-device changes, move pci proxy]
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20190619094907.10131-5-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/Makefile.objs     |   1 +
 hw/virtio/virtio-pmem-pci.c | 131 ++++++++++++++++++++++++++++++++++++
 hw/virtio/virtio-pmem-pci.h |  34 ++++++++++
 include/hw/pci/pci.h        |   1 +
 4 files changed, 167 insertions(+)
 create mode 100644 hw/virtio/virtio-pmem-pci.c
 create mode 100644 hw/virtio/virtio-pmem-pci.h

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 5857e3b8e1..964ce78607 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -13,6 +13,7 @@ obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon.o
 obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += virtio-crypto-pci.o
 obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
+common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pmem-pci.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
diff --git a/hw/virtio/virtio-pmem-pci.c b/hw/virtio/virtio-pmem-pci.c
new file mode 100644
index 0000000000..8b2d0dbccc
--- /dev/null
+++ b/hw/virtio/virtio-pmem-pci.c
@@ -0,0 +1,131 @@
+/*
+ * Virtio PMEM PCI device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ *  Pankaj Gupta <pagupta@redhat.com>
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "virtio-pmem-pci.h"
+#include "hw/mem/memory-device.h"
+#include "qapi/error.h"
+
+static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VirtIOPMEMPCI *pmem_pci = VIRTIO_PMEM_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&pmem_pci->vdev);
+
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void virtio_pmem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
+                                     Error **errp)
+{
+    object_property_set_uint(OBJECT(md), addr, VIRTIO_PMEM_ADDR_PROP, errp);
+}
+
+static uint64_t virtio_pmem_pci_get_addr(const MemoryDeviceState *md)
+{
+    return object_property_get_uint(OBJECT(md), VIRTIO_PMEM_ADDR_PROP,
+                                    &error_abort);
+}
+
+static MemoryRegion *virtio_pmem_pci_get_memory_region(MemoryDeviceState *md,
+                                                       Error **errp)
+{
+    VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+    VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+    VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+
+    return vpc->get_memory_region(pmem, errp);
+}
+
+static uint64_t virtio_pmem_pci_get_plugged_size(const MemoryDeviceState *md,
+                                                 Error **errp)
+{
+    VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+    VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+    VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+    MemoryRegion *mr = vpc->get_memory_region(pmem, errp);
+
+    /* the plugged size corresponds to the region size */
+    return mr ? 0 : memory_region_size(mr);
+}
+
+static void virtio_pmem_pci_fill_device_info(const MemoryDeviceState *md,
+                                             MemoryDeviceInfo *info)
+{
+    VirtioPMEMDeviceInfo *vi = g_new0(VirtioPMEMDeviceInfo, 1);
+    VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+    VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+    VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+    DeviceState *dev = DEVICE(md);
+
+    if (dev->id) {
+        vi->has_id = true;
+        vi->id = g_strdup(dev->id);
+    }
+
+    /* let the real device handle everything else */
+    vpc->fill_device_info(pmem, vi);
+
+    info->u.virtio_pmem.data = vi;
+    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
+}
+
+static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+    k->realize = virtio_pmem_pci_realize;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_PMEM;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_OTHERS;
+
+    mdc->get_addr = virtio_pmem_pci_get_addr;
+    mdc->set_addr = virtio_pmem_pci_set_addr;
+    mdc->get_plugged_size = virtio_pmem_pci_get_plugged_size;
+    mdc->get_memory_region = virtio_pmem_pci_get_memory_region;
+    mdc->fill_device_info = virtio_pmem_pci_fill_device_info;
+}
+
+static void virtio_pmem_pci_instance_init(Object *obj)
+{
+    VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_PMEM);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_pmem_pci_info = {
+    .base_name             = TYPE_VIRTIO_PMEM_PCI,
+    .generic_name          = "virtio-pmem-pci",
+    .transitional_name     = "virtio-pmem-pci-transitional",
+    .non_transitional_name = "virtio-pmem-pci-non-transitional",
+    .instance_size = sizeof(VirtIOPMEMPCI),
+    .instance_init = virtio_pmem_pci_instance_init,
+    .class_init    = virtio_pmem_pci_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_MEMORY_DEVICE },
+        { }
+    },
+};
+
+static void virtio_pmem_pci_register_types(void)
+{
+    virtio_pci_types_register(&virtio_pmem_pci_info);
+}
+type_init(virtio_pmem_pci_register_types)
diff --git a/hw/virtio/virtio-pmem-pci.h b/hw/virtio/virtio-pmem-pci.h
new file mode 100644
index 0000000000..616abef093
--- /dev/null
+++ b/hw/virtio/virtio-pmem-pci.h
@@ -0,0 +1,34 @@
+/*
+ * Virtio PMEM PCI device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ *  Pankaj Gupta <pagupta@redhat.com>
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_VIRTIO_PMEM_PCI_H
+#define QEMU_VIRTIO_PMEM_PCI_H
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-pmem.h"
+
+typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
+
+/*
+ * virtio-pmem-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci-base"
+#define VIRTIO_PMEM_PCI(obj) \
+        OBJECT_CHECK(VirtIOPMEMPCI, (obj), TYPE_VIRTIO_PMEM_PCI)
+
+struct VirtIOPMEMPCI {
+    VirtIOPCIProxy parent_obj;
+    VirtIOPMEM vdev;
+};
+
+#endif /* QEMU_VIRTIO_PMEM_PCI_H */
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index d082707dfa..aaf1b9f70d 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -85,6 +85,7 @@ extern bool pci_available;
 #define PCI_DEVICE_ID_VIRTIO_RNG         0x1005
 #define PCI_DEVICE_ID_VIRTIO_9P          0x1009
 #define PCI_DEVICE_ID_VIRTIO_VSOCK       0x1012
+#define PCI_DEVICE_ID_VIRTIO_PMEM        0x1013
 
 #define PCI_VENDOR_ID_REDHAT             0x1b36
 #define PCI_DEVICE_ID_REDHAT_BRIDGE      0x0001

From d766b22bbd18c7766e2e0a28ea0d07c18db13f7e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 19 Jun 2019 15:19:05 +0530
Subject: [PATCH 09/22] hmp: Handle virtio-pmem when printing memory device
 infos

Print the memory device info just like for PCDIMM/NVDIMM.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20190619094907.10131-6-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 monitor/hmp-cmds.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index c283dde0e9..1c97926432 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -2653,6 +2653,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
     Error *err = NULL;
     MemoryDeviceInfoList *info_list = qmp_query_memory_devices(&err);
     MemoryDeviceInfoList *info;
+    VirtioPMEMDeviceInfo *vpi;
     MemoryDeviceInfo *value;
     PCDIMMDeviceInfo *di;
 
@@ -2662,19 +2663,9 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
         if (value) {
             switch (value->type) {
             case MEMORY_DEVICE_INFO_KIND_DIMM:
-                di = value->u.dimm.data;
-                break;
-
             case MEMORY_DEVICE_INFO_KIND_NVDIMM:
-                di = value->u.nvdimm.data;
-                break;
-
-            default:
-                di = NULL;
-                break;
-            }
-
-            if (di) {
+                di = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
+                     value->u.dimm.data : value->u.nvdimm.data;
                 monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
                                MemoryDeviceInfoKind_str(value->type),
                                di->id ? di->id : "");
@@ -2687,6 +2678,18 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
                                di->hotplugged ? "true" : "false");
                 monitor_printf(mon, "  hotpluggable: %s\n",
                                di->hotpluggable ? "true" : "false");
+                break;
+            case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
+                vpi = value->u.virtio_pmem.data;
+                monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
+                               MemoryDeviceInfoKind_str(value->type),
+                               vpi->id ? vpi->id : "");
+                monitor_printf(mon, "  memaddr: 0x%" PRIx64 "\n", vpi->memaddr);
+                monitor_printf(mon, "  size: %" PRIu64 "\n", vpi->size);
+                monitor_printf(mon, "  memdev: %s\n", vpi->memdev);
+                break;
+            default:
+                g_assert_not_reached();
             }
         }
     }

From cae02c3480b806ce1fb30a3a5c54ae1b43a412e7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 19 Jun 2019 15:19:06 +0530
Subject: [PATCH 10/22] numa: Handle virtio-pmem in NUMA stats

Account the memory to node 0 for now. Once (if ever) virtio-pmem
supports NUMA, we can account it to the right node.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20190619094907.10131-7-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 numa.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/numa.c b/numa.c
index 955ec0c830..cfcdb8d943 100644
--- a/numa.c
+++ b/numa.c
@@ -549,6 +549,7 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
     MemoryDeviceInfoList *info_list = qmp_memory_device_list();
     MemoryDeviceInfoList *info;
     PCDIMMDeviceInfo     *pcdimm_info;
+    VirtioPMEMDeviceInfo *vpi;
 
     for (info = info_list; info; info = info->next) {
         MemoryDeviceInfo *value = info->value;
@@ -556,22 +557,21 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
         if (value) {
             switch (value->type) {
             case MEMORY_DEVICE_INFO_KIND_DIMM:
-                pcdimm_info = value->u.dimm.data;
-                break;
-
             case MEMORY_DEVICE_INFO_KIND_NVDIMM:
-                pcdimm_info = value->u.nvdimm.data;
-                break;
-
-            default:
-                pcdimm_info = NULL;
-                break;
-            }
-
-            if (pcdimm_info) {
+                pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
+                              value->u.dimm.data : value->u.nvdimm.data;
                 node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
                 node_mem[pcdimm_info->node].node_plugged_mem +=
                     pcdimm_info->size;
+                break;
+            case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
+                vpi = value->u.virtio_pmem.data;
+                /* TODO: once we support numa, assign to right node */
+                node_mem[0].node_mem += vpi->size;
+                node_mem[0].node_plugged_mem += vpi->size;
+                break;
+            default:
+                g_assert_not_reached();
             }
         }
     }

From a0a49813f7f2fc23bfe8a4fc6760e2a60c9a3e59 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 19 Jun 2019 15:19:07 +0530
Subject: [PATCH 11/22] pc: Support for virtio-pmem-pci

Override the device hotplug handler to properly handle the memory device
part via virtio-pmem-pci callbacks from the machine hotplug handler and
forward to the actual PCI bus hotplug handler.

As PCI hotplug has not been properly factored out into hotplug handlers,
most magic is performed in the (un)realize functions. Also some PCI host
buses don't have a PCI hotplug handler at all yet, just to be sure that
we alway have a hotplug handler on x86, add a simple error check.

Unlocking virtio-pmem will unlock virtio-pmem-pci.

Signed-off-by: David Hildenbrand <david@redhat.com>
[ Disable virtio-pmem hotunplug ]
Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Message-Id: <20190619094907.10131-8-pagupta@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/Kconfig |  1 +
 hw/i386/pc.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9817888216..4ddf2a9c55 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -30,6 +30,7 @@ config PC
     # For ACPI builder:
     select SERIAL_ISA
     select ACPI_VMGENID
+    select VIRTIO_PMEM_SUPPORTED
 
 config PC_PCI
     bool
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index e96360b47a..7b99b60cd3 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -79,6 +79,8 @@
 #include "hw/i386/intel_iommu.h"
 #include "hw/net/ne2000-isa.h"
 #include "standard-headers/asm-x86/bootparam.h"
+#include "hw/virtio/virtio-pmem-pci.h"
+#include "hw/mem/memory-device.h"
 
 /* debug PC/ISA interrupts */
 //#define DEBUG_IRQ
@@ -2395,6 +2397,65 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev,
     numa_cpu_pre_plug(cpu_slot, dev, errp);
 }
 
+static void pc_virtio_pmem_pci_pre_plug(HotplugHandler *hotplug_dev,
+                                        DeviceState *dev, Error **errp)
+{
+    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+    Error *local_err = NULL;
+
+    if (!hotplug_dev2) {
+        /*
+         * Without a bus hotplug handler, we cannot control the plug/unplug
+         * order. This should never be the case on x86, however better add
+         * a safety net.
+         */
+        error_setg(errp, "virtio-pmem-pci not supported on this bus.");
+        return;
+    }
+    /*
+     * First, see if we can plug this memory device at all. If that
+     * succeeds, branch of to the actual hotplug handler.
+     */
+    memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
+                           &local_err);
+    if (!local_err) {
+        hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
+    }
+    error_propagate(errp, local_err);
+}
+
+static void pc_virtio_pmem_pci_plug(HotplugHandler *hotplug_dev,
+                                    DeviceState *dev, Error **errp)
+{
+    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+    Error *local_err = NULL;
+
+    /*
+     * Plug the memory device first and then branch off to the actual
+     * hotplug handler. If that one fails, we can easily undo the memory
+     * device bits.
+     */
+    memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+    hotplug_handler_plug(hotplug_dev2, dev, &local_err);
+    if (local_err) {
+        memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+    }
+    error_propagate(errp, local_err);
+}
+
+static void pc_virtio_pmem_pci_unplug_request(HotplugHandler *hotplug_dev,
+                                              DeviceState *dev, Error **errp)
+{
+    /* We don't support virtio pmem hot unplug */
+    error_setg(errp, "virtio pmem device unplug not supported.");
+}
+
+static void pc_virtio_pmem_pci_unplug(HotplugHandler *hotplug_dev,
+                                      DeviceState *dev, Error **errp)
+{
+    /* We don't support virtio pmem hot unplug */
+}
+
 static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                           DeviceState *dev, Error **errp)
 {
@@ -2402,6 +2463,8 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
         pc_memory_pre_plug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_pre_plug(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
+        pc_virtio_pmem_pci_pre_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -2412,6 +2475,8 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev,
         pc_memory_plug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_plug(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
+        pc_virtio_pmem_pci_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -2422,6 +2487,8 @@ static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
         pc_memory_unplug_request(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_unplug_request_cb(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
+        pc_virtio_pmem_pci_unplug_request(hotplug_dev, dev, errp);
     } else {
         error_setg(errp, "acpi: device unplug request for not supported device"
                    " type: %s", object_get_typename(OBJECT(dev)));
@@ -2435,6 +2502,8 @@ static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
         pc_memory_unplug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_unplug_cb(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
+        pc_virtio_pmem_pci_unplug(hotplug_dev, dev, errp);
     } else {
         error_setg(errp, "acpi: device unplug for not supported device"
                    " type: %s", object_get_typename(OBJECT(dev)));
@@ -2445,7 +2514,8 @@ static HotplugHandler *pc_get_hotplug_handler(MachineState *machine,
                                              DeviceState *dev)
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
-        object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+        object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
+        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
         return HOTPLUG_HANDLER(machine);
     }
 

From 683c1d89efd1eeb111c129a9a91f629b94d90d45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Wed, 26 Jun 2019 01:23:33 +0200
Subject: [PATCH 12/22] virtio-pci: fix missing device properties
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit a4ee4c8baa37154 ("virtio: Helper for registering virtio
device types"), virtio-gpu-pci, virtio-vga, and virtio-crypto-pci lost
some properties: "ioeventfd" and "vectors". This may cause various
issues, such as failing migration or invalid properties.

Since those VirtioPCI devices do not have a base name, their class are
initialized with virtio_pci_generic_base_class_init(). However, if the
VirtioPCIDeviceTypeInfo provided a class_init which sets dc->props,
the properties were overwritten by virtio_pci_generic_class_init().

Instead, introduce an intermediary base-type to register the generic
properties.

Fixes: a4ee4c8baa37154f42b4dc6a13fee79268d15238
Cc: qemu-stable@nongnu.org
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190625232333.30752-1-marcandre.lureau@redhat.com>
---
 hw/virtio/virtio-pci.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 62e78e98f5..ce928f2429 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1913,13 +1913,6 @@ static void virtio_pci_generic_class_init(ObjectClass *klass, void *data)
     dc->props = virtio_pci_generic_properties;
 }
 
-/* Used when the generic type and the base type is the same */
-static void virtio_pci_generic_base_class_init(ObjectClass *klass, void *data)
-{
-    virtio_pci_base_class_init(klass, data);
-    virtio_pci_generic_class_init(klass, NULL);
-}
-
 static void virtio_pci_transitional_instance_init(Object *obj)
 {
     VirtIOPCIProxy *proxy = VIRTIO_PCI(obj);
@@ -1938,14 +1931,13 @@ static void virtio_pci_non_transitional_instance_init(Object *obj)
 
 void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t)
 {
+    char *base_name = NULL;
     TypeInfo base_type_info = {
         .name          = t->base_name,
         .parent        = t->parent ? t->parent : TYPE_VIRTIO_PCI,
         .instance_size = t->instance_size,
         .instance_init = t->instance_init,
         .class_size    = t->class_size,
-        .class_init    = virtio_pci_base_class_init,
-        .class_data    = (void *)t,
         .abstract      = true,
         .interfaces    = t->interfaces,
     };
@@ -1962,13 +1954,20 @@ void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t)
 
     if (!base_type_info.name) {
         /* No base type -> register a single generic device type */
-        base_type_info.name = t->generic_name;
-        base_type_info.class_init = virtio_pci_generic_base_class_init;
-        base_type_info.interfaces = generic_type_info.interfaces;
-        base_type_info.abstract = false;
-        generic_type_info.name = NULL;
+        /* use intermediate %s-base-type to add generic device props */
+        base_name = g_strdup_printf("%s-base-type", t->generic_name);
+        base_type_info.name = base_name;
+        base_type_info.class_init = virtio_pci_generic_class_init;
+
+        generic_type_info.parent = base_name;
+        generic_type_info.class_init = virtio_pci_base_class_init;
+        generic_type_info.class_data = (void *)t;
+
         assert(!t->non_transitional_name);
         assert(!t->transitional_name);
+    } else {
+        base_type_info.class_init = virtio_pci_base_class_init;
+        base_type_info.class_data = (void *)t;
     }
 
     type_register(&base_type_info);
@@ -2006,6 +2005,7 @@ void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t)
         };
         type_register(&transitional_type_info);
     }
+    g_free(base_name);
 }
 
 /* virtio-pci-bus */

From e57f2c31b6529d990eeafacdb19b9eb1904c23c6 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Wed, 26 Jun 2019 10:31:26 +0800
Subject: [PATCH 13/22] virtio: add "use-started" property

In order to avoid migration issues, we introduce a "use-started"
property to the base virtio device to indicate whether use
"started" flag or not. This property will be true by default and
set to false when machine type <= 4.0.

Suggested-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Message-Id: <20190626023130.31315-2-xieyongji@baidu.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/block/vhost-user-blk.c  |  4 ++--
 hw/core/machine.c          |  1 +
 hw/virtio/virtio.c         | 18 +++++++-----------
 include/hw/virtio/virtio.h | 21 +++++++++++++++++++++
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 9cb61336a6..85bc4017e7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -191,7 +191,7 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
 static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    bool should_start = vdev->started;
+    bool should_start = virtio_device_started(vdev, status);
     int ret;
 
     if (!vdev->vm_running) {
@@ -317,7 +317,7 @@ static int vhost_user_blk_connect(DeviceState *dev)
     }
 
     /* restore vhost state */
-    if (vdev->started) {
+    if (virtio_device_started(vdev, vdev->status)) {
         ret = vhost_user_blk_start(vdev);
         if (ret < 0) {
             error_report("vhost-user-blk: vhost start failed: %s",
diff --git a/hw/core/machine.c b/hw/core/machine.c
index ea5a01aa49..ea84bd6788 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -30,6 +30,7 @@ GlobalProperty hw_compat_4_0[] = {
     { "bochs-display",  "edid", "false" },
     { "virtio-vga",     "edid", "false" },
     { "virtio-gpu-pci", "edid", "false" },
+    { "virtio-device", "use-started", "false" },
 };
 const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0);
 
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index e1e90fcfd6..c9a6ca04b8 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1162,10 +1162,8 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val)
             }
         }
     }
-    vdev->started = val & VIRTIO_CONFIG_S_DRIVER_OK;
-    if (unlikely(vdev->start_on_kick && vdev->started)) {
-        vdev->start_on_kick = false;
-    }
+
+    virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK);
 
     if (k->set_status) {
         k->set_status(vdev, val);
@@ -1536,8 +1534,7 @@ static bool virtio_queue_notify_aio_vq(VirtQueue *vq)
         ret = vq->handle_aio_output(vdev, vq);
 
         if (unlikely(vdev->start_on_kick)) {
-            vdev->started = true;
-            vdev->start_on_kick = false;
+            virtio_set_started(vdev, true);
         }
     }
 
@@ -1557,8 +1554,7 @@ static void virtio_queue_notify_vq(VirtQueue *vq)
         vq->handle_output(vdev, vq);
 
         if (unlikely(vdev->start_on_kick)) {
-            vdev->started = true;
-            vdev->start_on_kick = false;
+            virtio_set_started(vdev, true);
         }
     }
 }
@@ -1579,8 +1575,7 @@ void virtio_queue_notify(VirtIODevice *vdev, int n)
     }
 
     if (unlikely(vdev->start_on_kick)) {
-        vdev->started = true;
-        vdev->start_on_kick = false;
+        virtio_set_started(vdev, true);
     }
 }
 
@@ -2291,7 +2286,7 @@ static void virtio_vmstate_change(void *opaque, int running, RunState state)
     VirtIODevice *vdev = opaque;
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-    bool backend_run = running && vdev->started;
+    bool backend_run = running && virtio_device_started(vdev, vdev->status);
     vdev->vm_running = running;
 
     if (backend_run) {
@@ -2669,6 +2664,7 @@ static void virtio_device_instance_finalize(Object *obj)
 
 static Property virtio_properties[] = {
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
+    DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 27c0efc3d0..15d5366939 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -105,6 +105,7 @@ struct VirtIODevice
     uint16_t device_id;
     bool vm_running;
     bool broken; /* device in invalid state, needs reset */
+    bool use_started;
     bool started;
     bool start_on_kick; /* virtio 1.0 transitional devices support that */
     VMChangeStateEntry *vmstate;
@@ -351,4 +352,24 @@ static inline bool virtio_is_big_endian(VirtIODevice *vdev)
     /* Devices conforming to VIRTIO 1.0 or later are always LE. */
     return false;
 }
+
+static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
+{
+    if (vdev->use_started) {
+        return vdev->started;
+    }
+
+    return status & VIRTIO_CONFIG_S_DRIVER_OK;
+}
+
+static inline void virtio_set_started(VirtIODevice *vdev, bool started)
+{
+    if (started) {
+        vdev->start_on_kick = false;
+    }
+
+    if (vdev->use_started) {
+        vdev->started = started;
+    }
+}
 #endif

From 7abccd088c9ef4c66dc891ff634a1f381ed4b2a8 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Wed, 26 Jun 2019 10:31:27 +0800
Subject: [PATCH 14/22] virtio: Set "start_on_kick" for legacy devices

Besides virtio 1.0 transitional devices, we should also
set "start_on_kick" flag for legacy devices (virtio 0.9).

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Message-Id: <20190626023130.31315-3-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio.c         | 6 ++----
 include/hw/virtio/virtio.h | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index c9a6ca04b8..f7504d1395 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1212,8 +1212,7 @@ void virtio_reset(void *opaque)
         k->reset(vdev);
     }
 
-    vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) &&
-                          !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1));
+    vdev->start_on_kick = !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1);
     vdev->started = false;
     vdev->broken = false;
     vdev->guest_features = 0;
@@ -2325,8 +2324,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
             g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
     }
 
-    vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) &&
-                          !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1));
+    vdev->start_on_kick = !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1);
     vdev->started = false;
     vdev->device_id = device_id;
     vdev->status = 0;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 15d5366939..b189788cb2 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -107,7 +107,7 @@ struct VirtIODevice
     bool broken; /* device in invalid state, needs reset */
     bool use_started;
     bool started;
-    bool start_on_kick; /* virtio 1.0 transitional devices support that */
+    bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
     VMChangeStateEntry *vmstate;
     char *bus_name;
     uint8_t device_endian;

From 868a8f44f57f15670b30631a95e3e31e5cf268d0 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Wed, 26 Jun 2019 10:31:28 +0800
Subject: [PATCH 15/22] virtio: Set "start_on_kick" on virtio_set_features()

The guest feature is not set correctly on virtio_reset() and
virtio_init(). So we should not use it to set "start_on_kick" at that
point. This patch set "start_on_kick" on virtio_set_features() instead.

Fixes: badaf79cfdbd3 ("virtio: Introduce started flag to VirtioDevice")
Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Message-Id: <20190626023130.31315-4-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index f7504d1395..5fd25d98a9 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1212,7 +1212,7 @@ void virtio_reset(void *opaque)
         k->reset(vdev);
     }
 
-    vdev->start_on_kick = !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1);
+    vdev->start_on_kick = false;
     vdev->started = false;
     vdev->broken = false;
     vdev->guest_features = 0;
@@ -2063,14 +2063,21 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val)
         return -EINVAL;
     }
     ret = virtio_set_features_nocheck(vdev, val);
-    if (!ret && virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
-        /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches.  */
-        int i;
-        for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
-            if (vdev->vq[i].vring.num != 0) {
-                virtio_init_region_cache(vdev, i);
+    if (!ret) {
+        if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+            /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches.  */
+            int i;
+            for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+                if (vdev->vq[i].vring.num != 0) {
+                    virtio_init_region_cache(vdev, i);
+                }
             }
         }
+
+        if (!virtio_device_started(vdev, vdev->status) &&
+            !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+            vdev->start_on_kick = true;
+        }
     }
     return ret;
 }
@@ -2222,6 +2229,11 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
         }
     }
 
+    if (!virtio_device_started(vdev, vdev->status) &&
+        !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+        vdev->start_on_kick = true;
+    }
+
     rcu_read_lock();
     for (i = 0; i < num; i++) {
         if (vdev->vq[i].vring.desc) {
@@ -2324,7 +2336,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
             g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
     }
 
-    vdev->start_on_kick = !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1);
+    vdev->start_on_kick = false;
     vdev->started = false;
     vdev->device_id = device_id;
     vdev->status = 0;

From 8b04e2c797b01a4baf379ec4f40b3f0e9cdf7a25 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Wed, 26 Jun 2019 10:31:29 +0800
Subject: [PATCH 16/22] virtio: Make sure we get correct state of device on
 handle_aio_output()

We should set the flags: "start_on_kick" and "started" after we call
the kick functions (handle_aio_output() and handle_output()).

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Message-Id: <20190626023130.31315-5-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 5fd25d98a9..e098fc8ef0 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1571,10 +1571,10 @@ void virtio_queue_notify(VirtIODevice *vdev, int n)
         event_notifier_set(&vq->host_notifier);
     } else if (vq->handle_output) {
         vq->handle_output(vdev, vq);
-    }
 
-    if (unlikely(vdev->start_on_kick)) {
-        virtio_set_started(vdev, true);
+        if (unlikely(vdev->start_on_kick)) {
+            virtio_set_started(vdev, true);
+        }
     }
 }
 

From 4c5cf37b504b1444b031d5c9af6b6ad2f171d006 Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@baidu.com>
Date: Wed, 26 Jun 2019 10:31:30 +0800
Subject: [PATCH 17/22] virtio: Don't change "started" flag on
 virtio_vmstate_change()

We will call virtio_set_status() on virtio_vmstate_change().
The "started" flag should not be changed in this case. Otherwise,
we may get an incorrect value when we set "started" flag but
not set DRIVER_OK in source VM.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Message-Id: <20190626023130.31315-6-xieyongji@baidu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index e098fc8ef0..18f9f4c372 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1163,7 +1163,10 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val)
         }
     }
 
-    virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK);
+    if ((vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) !=
+        (val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK);
+    }
 
     if (k->set_status) {
         k->set_status(vdev, val);

From 457cfcccdd107a968d106934df63b836b5dd743e Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Fri, 28 Jun 2019 17:02:27 -0300
Subject: [PATCH 18/22] pc: Move compat_apic_id_mode variable to PCMachineClass

Replace the static variable with a PCMachineClass field.  This
will help us eventually get rid of the pc_compat_*() init
functions.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20190628200227.1053-1-ehabkost@redhat.com>
Reviewed-by: Philippe Mathieu-Daude <philmd@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/pc.c         | 22 +++++++++-------------
 hw/i386/pc_piix.c    |  3 ++-
 include/hw/i386/pc.h |  3 +++
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7b99b60cd3..b380bd7d74 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -915,14 +915,6 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
     return false;
 }
 
-/* Enables contiguous-apic-ID mode, for compatibility */
-static bool compat_apic_id_mode;
-
-void enable_compat_apic_id_mode(void)
-{
-    compat_apic_id_mode = true;
-}
-
 /* Calculates initial APIC ID for a specific CPU index
  *
  * Currently we need to be able to calculate the APIC ID from the CPU index
@@ -930,13 +922,15 @@ void enable_compat_apic_id_mode(void)
  * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
  * all CPUs up to max_cpus.
  */
-static uint32_t x86_cpu_apic_id_from_index(unsigned int cpu_index)
+static uint32_t x86_cpu_apic_id_from_index(PCMachineState *pcms,
+                                           unsigned int cpu_index)
 {
+    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
     uint32_t correct_id;
     static bool warned;
 
     correct_id = x86_apicid_from_cpu_idx(smp_cores, smp_threads, cpu_index);
-    if (compat_apic_id_mode) {
+    if (pcmc->compat_apic_id_mode) {
         if (cpu_index != correct_id && !warned && !qtest_enabled()) {
             error_report("APIC IDs set in compatibility mode, "
                          "CPU topology won't match the configuration");
@@ -1535,7 +1529,8 @@ static void pc_new_cpu(const char *typename, int64_t apic_id, Error **errp)
 void pc_hot_add_cpu(const int64_t id, Error **errp)
 {
     MachineState *ms = MACHINE(qdev_get_machine());
-    int64_t apic_id = x86_cpu_apic_id_from_index(id);
+    PCMachineState *pcms = PC_MACHINE(ms);
+    int64_t apic_id = x86_cpu_apic_id_from_index(pcms, id);
     Error *local_err = NULL;
 
     if (id < 0) {
@@ -1571,7 +1566,7 @@ void pc_cpus_init(PCMachineState *pcms)
      *
      * This is used for FW_CFG_MAX_CPUS. See comments on bochs_bios_init().
      */
-    pcms->apic_id_limit = x86_cpu_apic_id_from_index(max_cpus - 1) + 1;
+    pcms->apic_id_limit = x86_cpu_apic_id_from_index(pcms, max_cpus - 1) + 1;
     possible_cpus = mc->possible_cpu_arch_ids(ms);
     for (i = 0; i < smp_cpus; i++) {
         pc_new_cpu(possible_cpus->cpus[i].type, possible_cpus->cpus[i].arch_id,
@@ -2730,6 +2725,7 @@ static int64_t pc_get_default_cpu_node_id(const MachineState *ms, int idx)
 
 static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms)
 {
+    PCMachineState *pcms = PC_MACHINE(ms);
     int i;
 
     if (ms->possible_cpus) {
@@ -2749,7 +2745,7 @@ static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms)
 
         ms->possible_cpus->cpus[i].type = ms->cpu_type;
         ms->possible_cpus->cpus[i].vcpus_count = 1;
-        ms->possible_cpus->cpus[i].arch_id = x86_cpu_apic_id_from_index(i);
+        ms->possible_cpus->cpus[i].arch_id = x86_cpu_apic_id_from_index(pcms, i);
         x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id,
                                  smp_cores, smp_threads, &topo);
         ms->possible_cpus->cpus[i].props.has_socket_id = true;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index c07c4a5b38..f29de58636 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -358,7 +358,6 @@ static void pc_compat_1_4_fn(MachineState *machine)
 static void pc_compat_1_3(MachineState *machine)
 {
     pc_compat_1_4_fn(machine);
-    enable_compat_apic_id_mode();
 }
 
 /* PC compat function for pc-0.14 to pc-1.2 */
@@ -708,6 +707,7 @@ DEFINE_I440FX_MACHINE(v1_4, "pc-i440fx-1.4", pc_compat_1_4_fn,
 
 static void pc_i440fx_1_3_machine_options(MachineClass *m)
 {
+    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
     static GlobalProperty compat[] = {
         PC_CPU_MODEL_IDS("1.3.0")
         { "usb-tablet", "usb_version", "1" },
@@ -718,6 +718,7 @@ static void pc_i440fx_1_3_machine_options(MachineClass *m)
 
     pc_i440fx_1_4_machine_options(m);
     m->hw_version = "1.3.0";
+    pcmc->compat_apic_id_mode = true;
     compat_props_add(m->compat_props, compat, G_N_ELEMENTS(compat));
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index c54cc54a47..853502f277 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -134,6 +134,9 @@ typedef struct PCMachineClass {
 
     /* use PVH to load kernels that support this feature */
     bool pvh_enabled;
+
+    /* Enables contiguous-apic-ID mode */
+    bool compat_apic_id_mode;
 } PCMachineClass;
 
 #define TYPE_PC_MACHINE "generic-pc-machine"

From db68f4ff06cbe0517ed0d9b5634f6cddaed2547c Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 26 Jun 2019 08:48:12 +0100
Subject: [PATCH 19/22] libvhost-user: add vmsg_set_reply_u64() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The VhostUserMsg request is reused as the reply by message processing
functions.  This is risky since request fields may corrupt the reply if
the vhost-user message handler function forgets to re-initialize them.

Changing this practice would be very invasive but we can introduce a
helper function to make u64 payload replies safe.  This also eliminates
code duplication in message processing functions.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190626074815.19994-2-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 443b7e08c3..a8657c7af2 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -216,6 +216,15 @@ vmsg_close_fds(VhostUserMsg *vmsg)
     }
 }
 
+/* Set reply payload.u64 and clear request flags and fd_num */
+static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
+{
+    vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
+    vmsg->size = sizeof(vmsg->payload.u64);
+    vmsg->payload.u64 = val;
+    vmsg->fd_num = 0;
+}
+
 /* A test to see if we have userfault available */
 static bool
 have_userfault(void)
@@ -1168,10 +1177,7 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
         features |= dev->iface->get_protocol_features(dev);
     }
 
-    vmsg->payload.u64 = features;
-    vmsg->size = sizeof(vmsg->payload.u64);
-    vmsg->fd_num = 0;
-
+    vmsg_set_reply_u64(vmsg, features);
     return true;
 }
 
@@ -1307,17 +1313,14 @@ out:
 static bool
 vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
 {
-    vmsg->payload.u64 = -1;
-    vmsg->size = sizeof(vmsg->payload.u64);
-
     if (dev->nregions) {
         vu_panic(dev, "Regions already registered at postcopy-listen");
+        vmsg_set_reply_u64(vmsg, -1);
         return true;
     }
     dev->postcopy_listening = true;
 
-    vmsg->flags = VHOST_USER_VERSION |  VHOST_USER_REPLY_MASK;
-    vmsg->payload.u64 = 0; /* Success */
+    vmsg_set_reply_u64(vmsg, 0);
     return true;
 }
 
@@ -1332,10 +1335,7 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
         DPRINT("%s: Done close\n", __func__);
     }
 
-    vmsg->fd_num = 0;
-    vmsg->payload.u64 = 0;
-    vmsg->size = sizeof(vmsg->payload.u64);
-    vmsg->flags = VHOST_USER_VERSION |  VHOST_USER_REPLY_MASK;
+    vmsg_set_reply_u64(vmsg, 0);
     DPRINT("%s: exit\n", __func__);
     return true;
 }

From 6f5fd837889814e57a4bb473bf80ce08e355a12d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 26 Jun 2019 08:48:13 +0100
Subject: [PATCH 20/22] libvhost-user: support many virtqueues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently libvhost-user is hardcoded to at most 8 virtqueues.  The
device backend should decide the number of virtqueues, not
libvhost-user.  This is important for multiqueue device backends where
the guest driver needs an accurate number of virtqueues.

This change breaks libvhost-user and libvhost-user-glib API stability.
There is no stability guarantee yet, so make this change now and update
all in-tree library users.

This patch touches up vhost-user-blk, vhost-user-gpu, vhost-user-input,
vhost-user-scsi, and vhost-user-bridge.  If the device has a fixed
number of queues that exact number is used.  Otherwise the previous
default of 8 virtqueues is used.

vu_init() and vug_init() can now fail if malloc() returns NULL.  I
considered aborting with an error in libvhost-user but it should be safe
to instantiate new vhost-user instances at runtime without risk of
terminating the process.  Therefore callers need to handle the vu_init()
failure now.

vhost-user-blk and vhost-user-scsi duplicate virtqueue index checks that
are already performed by libvhost-user.  This code would need to be
modified to use max_queues but remove it completely instead since it's
redundant.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190626074815.19994-3-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user-glib.c | 12 +++++--
 contrib/libvhost-user/libvhost-user-glib.h |  2 +-
 contrib/libvhost-user/libvhost-user.c      | 33 ++++++++++++-----
 contrib/libvhost-user/libvhost-user.h      | 10 ++++--
 contrib/vhost-user-blk/vhost-user-blk.c    | 16 +++++----
 contrib/vhost-user-gpu/main.c              |  9 ++++-
 contrib/vhost-user-input/main.c            | 11 +++++-
 contrib/vhost-user-scsi/vhost-user-scsi.c  | 21 +++++------
 tests/vhost-user-bridge.c                  | 42 ++++++++++++++--------
 9 files changed, 105 insertions(+), 51 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user-glib.c b/contrib/libvhost-user/libvhost-user-glib.c
index 42660a1b36..99edd2f3de 100644
--- a/contrib/libvhost-user/libvhost-user-glib.c
+++ b/contrib/libvhost-user/libvhost-user-glib.c
@@ -131,18 +131,24 @@ static void vug_watch(VuDev *dev, int condition, void *data)
     }
 }
 
-void
-vug_init(VugDev *dev, int socket,
+bool
+vug_init(VugDev *dev, uint16_t max_queues, int socket,
          vu_panic_cb panic, const VuDevIface *iface)
 {
     g_assert(dev);
     g_assert(iface);
 
-    vu_init(&dev->parent, socket, panic, set_watch, remove_watch, iface);
+    if (!vu_init(&dev->parent, max_queues, socket, panic, set_watch,
+                 remove_watch, iface)) {
+        return false;
+    }
+
     dev->fdmap = g_hash_table_new_full(NULL, NULL, NULL,
                                        (GDestroyNotify) g_source_destroy);
 
     dev->src = vug_source_new(dev, socket, G_IO_IN, vug_watch, NULL);
+
+    return true;
 }
 
 void
diff --git a/contrib/libvhost-user/libvhost-user-glib.h b/contrib/libvhost-user/libvhost-user-glib.h
index d3200f3afc..64d539d93a 100644
--- a/contrib/libvhost-user/libvhost-user-glib.h
+++ b/contrib/libvhost-user/libvhost-user-glib.h
@@ -25,7 +25,7 @@ typedef struct VugDev {
     GSource *src;
 } VugDev;
 
-void vug_init(VugDev *dev, int socket,
+bool vug_init(VugDev *dev, uint16_t max_queues, int socket,
               vu_panic_cb panic, const VuDevIface *iface);
 void vug_deinit(VugDev *dev);
 
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index a8657c7af2..90bea856dd 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -493,9 +493,9 @@ vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 static void
 vu_set_enable_all_rings(VuDev *dev, bool enabled)
 {
-    int i;
+    uint16_t i;
 
-    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
+    for (i = 0; i < dev->max_queues; i++) {
         dev->vq[i].enable = enabled;
     }
 }
@@ -916,7 +916,7 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
 {
     int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 
-    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+    if (index >= dev->max_queues) {
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Invalid queue index: %u", index);
         return false;
@@ -1213,7 +1213,7 @@ vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
     DPRINT("State.index: %d\n", index);
     DPRINT("State.enable:   %d\n", enable);
 
-    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
+    if (index >= dev->max_queues) {
         vu_panic(dev, "Invalid vring_enable index: %u", index);
         return false;
     }
@@ -1582,7 +1582,7 @@ vu_deinit(VuDev *dev)
     }
     dev->nregions = 0;
 
-    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
+    for (i = 0; i < dev->max_queues; i++) {
         VuVirtq *vq = &dev->vq[i];
 
         if (vq->call_fd != -1) {
@@ -1627,18 +1627,23 @@ vu_deinit(VuDev *dev)
     if (dev->sock != -1) {
         close(dev->sock);
     }
+
+    free(dev->vq);
+    dev->vq = NULL;
 }
 
-void
+bool
 vu_init(VuDev *dev,
+        uint16_t max_queues,
         int socket,
         vu_panic_cb panic,
         vu_set_watch_cb set_watch,
         vu_remove_watch_cb remove_watch,
         const VuDevIface *iface)
 {
-    int i;
+    uint16_t i;
 
+    assert(max_queues > 0);
     assert(socket >= 0);
     assert(set_watch);
     assert(remove_watch);
@@ -1654,18 +1659,28 @@ vu_init(VuDev *dev,
     dev->iface = iface;
     dev->log_call_fd = -1;
     dev->slave_fd = -1;
-    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
+    dev->max_queues = max_queues;
+
+    dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
+    if (!dev->vq) {
+        DPRINT("%s: failed to malloc virtqueues\n", __func__);
+        return false;
+    }
+
+    for (i = 0; i < max_queues; i++) {
         dev->vq[i] = (VuVirtq) {
             .call_fd = -1, .kick_fd = -1, .err_fd = -1,
             .notification = true,
         };
     }
+
+    return true;
 }
 
 VuVirtq *
 vu_get_queue(VuDev *dev, int qidx)
 {
-    assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
+    assert(qidx < dev->max_queues);
     return &dev->vq[qidx];
 }
 
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 3b888ff0a5..46b600799b 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -25,7 +25,6 @@
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 #define VHOST_LOG_PAGE 4096
 
-#define VHOST_MAX_NR_VIRTQUEUE 8
 #define VIRTQUEUE_MAX_SIZE 1024
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
@@ -353,7 +352,7 @@ struct VuDev {
     int sock;
     uint32_t nregions;
     VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-    VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
+    VuVirtq *vq;
     VuDevInflightInfo inflight_info;
     int log_call_fd;
     int slave_fd;
@@ -362,6 +361,7 @@ struct VuDev {
     uint64_t features;
     uint64_t protocol_features;
     bool broken;
+    uint16_t max_queues;
 
     /* @set_watch: add or update the given fd to the watch set,
      * call cb when condition is met */
@@ -391,6 +391,7 @@ typedef struct VuVirtqElement {
 /**
  * vu_init:
  * @dev: a VuDev context
+ * @max_queues: maximum number of virtqueues
  * @socket: the socket connected to vhost-user master
  * @panic: a panic callback
  * @set_watch: a set_watch callback
@@ -398,8 +399,11 @@ typedef struct VuVirtqElement {
  * @iface: a VuDevIface structure with vhost-user device callbacks
  *
  * Intializes a VuDev vhost-user context.
+ *
+ * Returns: true on success, false on failure.
  **/
-void vu_init(VuDev *dev,
+bool vu_init(VuDev *dev,
+             uint16_t max_queues,
              int socket,
              vu_panic_cb panic,
              vu_set_watch_cb set_watch,
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index 86a3987744..ae61034656 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -25,6 +25,10 @@
 #include <sys/ioctl.h>
 #endif
 
+enum {
+    VHOST_USER_BLK_MAX_QUEUES = 8,
+};
+
 struct virtio_blk_inhdr {
     unsigned char status;
 };
@@ -334,12 +338,6 @@ static void vub_process_vq(VuDev *vu_dev, int idx)
     VuVirtq *vq;
     int ret;
 
-    if ((idx < 0) || (idx >= VHOST_MAX_NR_VIRTQUEUE)) {
-        fprintf(stderr, "VQ Index out of range: %d\n", idx);
-        vub_panic_cb(vu_dev, NULL);
-        return;
-    }
-
     gdev = container_of(vu_dev, VugDev, parent);
     vdev_blk = container_of(gdev, VubDev, parent);
     assert(vdev_blk);
@@ -631,7 +629,11 @@ int main(int argc, char **argv)
         vdev_blk->enable_ro = true;
     }
 
-    vug_init(&vdev_blk->parent, csock, vub_panic_cb, &vub_iface);
+    if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock,
+                  vub_panic_cb, &vub_iface)) {
+        fprintf(stderr, "Failed to initialized libvhost-user-glib\n");
+        goto err;
+    }
 
     g_main_loop_run(vdev_blk->loop);
 
diff --git a/contrib/vhost-user-gpu/main.c b/contrib/vhost-user-gpu/main.c
index 04b753046f..b45d2019b4 100644
--- a/contrib/vhost-user-gpu/main.c
+++ b/contrib/vhost-user-gpu/main.c
@@ -25,6 +25,10 @@
 #include "virgl.h"
 #include "vugbm.h"
 
+enum {
+    VHOST_USER_GPU_MAX_QUEUES = 2,
+};
+
 struct virtio_gpu_simple_resource {
     uint32_t resource_id;
     uint32_t width;
@@ -1169,7 +1173,10 @@ main(int argc, char *argv[])
         exit(EXIT_FAILURE);
     }
 
-    vug_init(&g.dev, fd, vg_panic, &vuiface);
+    if (!vug_init(&g.dev, VHOST_USER_GPU_MAX_QUEUES, fd, vg_panic, &vuiface)) {
+        g_printerr("Failed to initialize libvhost-user-glib.\n");
+        exit(EXIT_FAILURE);
+    }
 
     loop = g_main_loop_new(NULL, FALSE);
     g_main_loop_run(loop);
diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c
index 8b4e7d2536..449fd2171a 100644
--- a/contrib/vhost-user-input/main.c
+++ b/contrib/vhost-user-input/main.c
@@ -17,6 +17,10 @@
 #include "standard-headers/linux/virtio_input.h"
 #include "qapi/error.h"
 
+enum {
+    VHOST_USER_INPUT_MAX_QUEUES = 2,
+};
+
 typedef struct virtio_input_event virtio_input_event;
 typedef struct virtio_input_config virtio_input_config;
 
@@ -384,7 +388,12 @@ main(int argc, char *argv[])
         g_printerr("Invalid vhost-user socket.\n");
         exit(EXIT_FAILURE);
     }
-    vug_init(&vi.dev, fd, vi_panic, &vuiface);
+
+    if (!vug_init(&vi.dev, VHOST_USER_INPUT_MAX_QUEUES, fd, vi_panic,
+                  &vuiface)) {
+        g_printerr("Failed to initialize libvhost-user-glib.\n");
+        exit(EXIT_FAILURE);
+    }
 
     loop = g_main_loop_new(NULL, FALSE);
     g_main_loop_run(loop);
diff --git a/contrib/vhost-user-scsi/vhost-user-scsi.c b/contrib/vhost-user-scsi/vhost-user-scsi.c
index 496dd6e693..0fc14d7899 100644
--- a/contrib/vhost-user-scsi/vhost-user-scsi.c
+++ b/contrib/vhost-user-scsi/vhost-user-scsi.c
@@ -19,6 +19,10 @@
 
 #define VUS_ISCSI_INITIATOR "iqn.2016-11.com.nutanix:vhost-user-scsi"
 
+enum {
+    VHOST_USER_SCSI_MAX_QUEUES = 8,
+};
+
 typedef struct VusIscsiLun {
     struct iscsi_context *iscsi_ctx;
     int iscsi_lun;
@@ -231,11 +235,6 @@ static void vus_proc_req(VuDev *vu_dev, int idx)
 
     gdev = container_of(vu_dev, VugDev, parent);
     vdev_scsi = container_of(gdev, VusDev, parent);
-    if (idx < 0 || idx >= VHOST_MAX_NR_VIRTQUEUE) {
-        g_warning("VQ Index out of range: %d", idx);
-        vus_panic_cb(vu_dev, NULL);
-        return;
-    }
 
     vq = vu_get_queue(vu_dev, idx);
     if (!vq) {
@@ -295,12 +294,6 @@ static void vus_queue_set_started(VuDev *vu_dev, int idx, bool started)
 
     assert(vu_dev);
 
-    if (idx < 0 || idx >= VHOST_MAX_NR_VIRTQUEUE) {
-        g_warning("VQ Index out of range: %d", idx);
-        vus_panic_cb(vu_dev, NULL);
-        return;
-    }
-
     vq = vu_get_queue(vu_dev, idx);
 
     if (idx == 0 || idx == 1) {
@@ -398,7 +391,11 @@ int main(int argc, char **argv)
         goto err;
     }
 
-    vug_init(&vdev_scsi->parent, csock, vus_panic_cb, &vus_iface);
+    if (!vug_init(&vdev_scsi->parent, VHOST_USER_SCSI_MAX_QUEUES, csock,
+                  vus_panic_cb, &vus_iface)) {
+        g_printerr("Failed to initialize libvhost-user-glib\n");
+        goto err;
+    }
 
     g_main_loop_run(vdev_scsi->loop);
 
diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
index 0bb03af0e5..c4e350e1f5 100644
--- a/tests/vhost-user-bridge.c
+++ b/tests/vhost-user-bridge.c
@@ -45,6 +45,10 @@
         } \
     } while (0)
 
+enum {
+    VHOST_USER_BRIDGE_MAX_QUEUES = 8,
+};
+
 typedef void (*CallbackFunc)(int sock, void *ctx);
 
 typedef struct Event {
@@ -512,12 +516,16 @@ vubr_accept_cb(int sock, void *ctx)
     }
     DPRINT("Got connection from remote peer on sock %d\n", conn_fd);
 
-    vu_init(&dev->vudev,
-            conn_fd,
-            vubr_panic,
-            vubr_set_watch,
-            vubr_remove_watch,
-            &vuiface);
+    if (!vu_init(&dev->vudev,
+                 VHOST_USER_BRIDGE_MAX_QUEUES,
+                 conn_fd,
+                 vubr_panic,
+                 vubr_set_watch,
+                 vubr_remove_watch,
+                 &vuiface)) {
+        fprintf(stderr, "Failed to initialize libvhost-user\n");
+        exit(1);
+    }
 
     dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb);
     dispatcher_remove(&dev->dispatcher, sock);
@@ -560,12 +568,18 @@ vubr_new(const char *path, bool client)
         if (connect(dev->sock, (struct sockaddr *)&un, len) == -1) {
             vubr_die("connect");
         }
-        vu_init(&dev->vudev,
-                dev->sock,
-                vubr_panic,
-                vubr_set_watch,
-                vubr_remove_watch,
-                &vuiface);
+
+        if (!vu_init(&dev->vudev,
+                     VHOST_USER_BRIDGE_MAX_QUEUES,
+                     dev->sock,
+                     vubr_panic,
+                     vubr_set_watch,
+                     vubr_remove_watch,
+                     &vuiface)) {
+            fprintf(stderr, "Failed to initialize libvhost-user\n");
+            exit(1);
+        }
+
         cb = vubr_receive_cb;
     }
 
@@ -584,7 +598,7 @@ static void *notifier_thread(void *arg)
     int qidx;
 
     while (true) {
-        for (qidx = 0; qidx < VHOST_MAX_NR_VIRTQUEUE; qidx++) {
+        for (qidx = 0; qidx < VHOST_USER_BRIDGE_MAX_QUEUES; qidx++) {
             uint16_t *n = vubr->notifier.addr + pagesize * qidx;
 
             if (*n == qidx) {
@@ -616,7 +630,7 @@ vubr_host_notifier_setup(VubrDev *dev)
     void *addr;
     int fd;
 
-    length = getpagesize() * VHOST_MAX_NR_VIRTQUEUE;
+    length = getpagesize() * VHOST_USER_BRIDGE_MAX_QUEUES;
 
     fd = mkstemp(template);
     if (fd < 0) {

From ea1b802ca27e5b38798b1e346352ad724aa841cd Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 26 Jun 2019 08:48:14 +0100
Subject: [PATCH 21/22] libvhost-user: implement VHOST_USER_PROTOCOL_F_MQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Existing vhost-user device backends, including vhost-user-scsi and
vhost-user-blk, support multiqueue but libvhost-user currently does not
advertise this.

VHOST_USER_PROTOCOL_F_MQ enables the VHOST_USER_GET_QUEUE_NUM request
needed for a vhost-user master to query the number of queues.  For
example, QEMU's vhost-user-net master depends on
VHOST_USER_PROTOCOL_F_MQ for multiqueue.

If you're wondering how any device backend with more than one virtqueue
functions today, it's because device types with a fixed number of
virtqueues do not require querying the number of queues.  Therefore the
vhost-user master for vhost-user-input with 2 virtqueues, for example,
doesn't actually depend on VHOST_USER_PROTOCOL_F_MQ.  It just enables
virtqueues 0 and 1 without asking.

Let there be multiqueue!

Suggested-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190626074815.19994-4-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 90bea856dd..4b36e35a82 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -1160,7 +1160,8 @@ vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
 static bool
 vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
-    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
+    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
+                        1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                         1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
                         1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                         1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD;
@@ -1200,8 +1201,8 @@ vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 static bool
 vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
-    DPRINT("Function %s() not implemented yet.\n", __func__);
-    return false;
+    vmsg_set_reply_u64(vmsg, dev->max_queues);
+    return true;
 }
 
 static bool

From 3ef4dff2b397c8932fd3b4d955cd6ba620245475 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 26 Jun 2019 08:48:15 +0100
Subject: [PATCH 22/22] docs: avoid vhost-user-net specifics in multiqueue
 section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Multiple queue support" section makes references to vhost-user-net
"queue pairs".  This is confusing for two reasons:
1. This actually applies to all device types, not just vhost-user-net.
2. VHOST_USER_GET_QUEUE_NUM returns the number of virtqueues, not the
   number of queue pairs.

Reword the section so that the vhost-user-net specific part is relegated
to the very end: we acknowledge that vhost-user-net historically
automatically enabled the first queue pair.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20190626074815.19994-5-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 docs/interop/vhost-user.rst | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index dc0ff9211f..5750668aba 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -324,19 +324,20 @@ must support changing some configuration aspects on the fly.
 Multiple queue support
 ----------------------
 
-Multiple queue is treated as a protocol extension, hence the slave has
-to implement protocol features first. The multiple queues feature is
-supported only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ``
-(bit 0) is set.
+Multiple queue support allows the slave to advertise the maximum number of
+queues.  This is treated as a protocol extension, hence the slave has to
+implement protocol features first. The multiple queues feature is supported
+only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ`` (bit 0) is set.
 
-The max number of queue pairs the slave supports can be queried with
-message ``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the
-number of requested queues is bigger than that.
+The max number of queues the slave supports can be queried with message
+``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the number of requested
+queues is bigger than that.
 
 As all queues share one connection, the master uses a unique index for each
-queue in the sent message to identify a specified queue. One queue pair
-is enabled initially. More queues are enabled dynamically, by sending
-message ``VHOST_USER_SET_VRING_ENABLE``.
+queue in the sent message to identify a specified queue.
+
+The master enables queues by sending message ``VHOST_USER_SET_VRING_ENABLE``.
+vhost-user-net has historically automatically enabled the first queue pair.
 
 Migration
 ---------