From 0ac8ef71329ee242951074eb2dc7136f99421d8c Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 4 Jan 2011 12:37:50 -0700
Subject: [PATCH 01/12] qdev: Track runtime machine modifications

Create a trivial interface to track whether the machine has been
modified since boot.  Adding or removing devices will trigger this
to return true.  An example usage scenario for such an interface is
the rtl8139 driver which includes a cpu_register_io_memory() value
in it's migration stream.  For the majority of migrations, where
no hotplug has occured in the machine, this works correctly.  Once
the machine is modified, we can use this interface to detect that
and include a subsection for the device to prevent migrations to
rtl8139 versions with this bug.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/qdev.c | 10 ++++++++++
 hw/qdev.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/hw/qdev.c b/hw/qdev.c
index 31eb464f23..5b8d3742ec 100644
--- a/hw/qdev.c
+++ b/hw/qdev.c
@@ -32,6 +32,8 @@
 #include "blockdev.h"
 
 static int qdev_hotplug = 0;
+static bool qdev_hot_added = false;
+static bool qdev_hot_removed = false;
 
 /* This is a nasty hack to allow passing a NULL bus to qdev_create.  */
 static BusState *main_system_bus;
@@ -93,6 +95,7 @@ static DeviceState *qdev_create_from_info(BusState *bus, DeviceInfo *info)
     if (qdev_hotplug) {
         assert(bus->allow_hotplug);
         dev->hotplugged = 1;
+        qdev_hot_added = true;
     }
     dev->instance_id_alias = -1;
     dev->state = DEV_STATE_CREATED;
@@ -294,6 +297,8 @@ int qdev_unplug(DeviceState *dev)
     }
     assert(dev->info->unplug != NULL);
 
+    qdev_hot_removed = true;
+
     return dev->info->unplug(dev);
 }
 
@@ -395,6 +400,11 @@ void qdev_machine_creation_done(void)
     qdev_hotplug = 1;
 }
 
+bool qdev_machine_modified(void)
+{
+    return qdev_hot_added || qdev_hot_removed;
+}
+
 /* Get a character (serial) device interface.  */
 CharDriverState *qdev_init_chardev(DeviceState *dev)
 {
diff --git a/hw/qdev.h b/hw/qdev.h
index 2be775f9e8..e520aaa786 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -132,6 +132,7 @@ int qdev_unplug(DeviceState *dev);
 void qdev_free(DeviceState *dev);
 int qdev_simple_unplug_cb(DeviceState *dev);
 void qdev_machine_creation_done(void);
+bool qdev_machine_modified(void);
 
 qemu_irq qdev_get_gpio_in(DeviceState *dev, int n);
 void qdev_connect_gpio_out(DeviceState *dev, int n, qemu_irq pin);

From c574ba5a4ce0faee6a687412804d6045ef815327 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 4 Jan 2011 12:38:02 -0700
Subject: [PATCH 02/12] rtl8139: Use subsection to restrict migration after
 hotplug

rtl8139 includes a cpu_register_io_memory acquired value in it's
migration data.  This is not only unecessary, but we should treat
these values as unique to the VM instances since the value depends
on call order.  In most cases, this miraculously still works.
However, if devices are added or removed from the system, it may
represent an ordering change, which could cause the target rtl8139
device to make use of another device's cpu_register_io_memory value.
If we detect that a hot-add/remove has occured, include a subsection
to restrict migrations only to driver versions known to include this
fix.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/rtl8139.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index a8aed89074..a22530cf89 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -495,6 +495,8 @@ typedef struct RTL8139State {
     QEMUTimer *timer;
     int64_t TimerExpire;
 
+    /* Support migration to/from old versions */
+    int rtl8139_mmio_io_addr_dummy;
 } RTL8139State;
 
 static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
@@ -3162,6 +3164,21 @@ static int rtl8139_post_load(void *opaque, int version_id)
     return 0;
 }
 
+static bool rtl8139_hotplug_ready_needed(void *opaque)
+{
+    return qdev_machine_modified();
+}
+
+static const VMStateDescription vmstate_rtl8139_hotplug_ready ={
+    .name = "rtl8139/hotplug_ready",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField []) {
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static void rtl8139_pre_save(void *opaque)
 {
     RTL8139State* s = opaque;
@@ -3171,6 +3188,7 @@ static void rtl8139_pre_save(void *opaque)
     rtl8139_set_next_tctr_time(s, current_time);
     s->TCTR = muldiv64(current_time - s->TCTR_base, PCI_FREQUENCY,
                        get_ticks_per_sec());
+    s->rtl8139_mmio_io_addr_dummy = s->rtl8139_mmio_io_addr;
 }
 
 static const VMStateDescription vmstate_rtl8139 = {
@@ -3223,7 +3241,7 @@ static const VMStateDescription vmstate_rtl8139 = {
 
         VMSTATE_UNUSED(4),
         VMSTATE_MACADDR(conf.macaddr, RTL8139State),
-        VMSTATE_INT32(rtl8139_mmio_io_addr, RTL8139State),
+        VMSTATE_INT32(rtl8139_mmio_io_addr_dummy, RTL8139State),
 
         VMSTATE_UINT32(currTxDesc, RTL8139State),
         VMSTATE_UINT32(currCPlusRxDesc, RTL8139State),
@@ -3252,6 +3270,14 @@ static const VMStateDescription vmstate_rtl8139 = {
 
         VMSTATE_UINT32_V(cplus_enabled, RTL8139State, 4),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection []) {
+        {
+            .vmsd = &vmstate_rtl8139_hotplug_ready,
+            .needed = rtl8139_hotplug_ready_needed,
+        }, {
+            /* empty */
+        }
     }
 };
 

From 180c22e18b0a9be21445271f94347238b0bc0a25 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 6 Jan 2011 15:14:37 +0100
Subject: [PATCH 03/12] pci: allow devices being tagged as not hotpluggable.

This patch adds a field to PCIDeviceInfo to tag devices as being
not hotpluggable.  Any attempt to plug-in or -out such a device
will throw an error.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci.c | 10 ++++++++++
 hw/pci.h |  3 +++
 qerror.c |  4 ++++
 qerror.h |  3 +++
 4 files changed, 20 insertions(+)

diff --git a/hw/pci.c b/hw/pci.c
index d0b51b80bd..8d0e3df2e5 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1624,6 +1624,11 @@ static int pci_qdev_init(DeviceState *qdev, DeviceInfo *base)
                                      info->is_bridge);
     if (pci_dev == NULL)
         return -1;
+    if (qdev->hotplugged && info->no_hotplug) {
+        qerror_report(QERR_DEVICE_NO_HOTPLUG, info->qdev.name);
+        do_pci_unregister_device(pci_dev);
+        return -1;
+    }
     rc = info->init(pci_dev);
     if (rc != 0) {
         do_pci_unregister_device(pci_dev);
@@ -1656,7 +1661,12 @@ static int pci_qdev_init(DeviceState *qdev, DeviceInfo *base)
 static int pci_unplug_device(DeviceState *qdev)
 {
     PCIDevice *dev = DO_UPCAST(PCIDevice, qdev, qdev);
+    PCIDeviceInfo *info = container_of(qdev->info, PCIDeviceInfo, qdev);
 
+    if (info->no_hotplug) {
+        qerror_report(QERR_DEVICE_NO_HOTPLUG, info->qdev.name);
+        return -1;
+    }
     return dev->bus->hotplug(dev->bus->hotplug_qdev, dev,
                              PCI_HOTPLUG_DISABLED);
 }
diff --git a/hw/pci.h b/hw/pci.h
index 052960e3ea..bc8d5bb3c7 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -436,6 +436,9 @@ typedef struct {
     /* pcie stuff */
     int is_express;   /* is this device pci express? */
 
+    /* device isn't hot-pluggable */
+    int no_hotplug;
+
     /* rom bar */
     const char *romfile;
 } PCIDeviceInfo;
diff --git a/qerror.c b/qerror.c
index ac2cdafa65..9d0cdeb45c 100644
--- a/qerror.c
+++ b/qerror.c
@@ -100,6 +100,10 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_DEVICE_NO_BUS,
         .desc      = "Device '%(device)' has no child bus",
     },
+    {
+        .error_fmt = QERR_DEVICE_NO_HOTPLUG,
+        .desc      = "Device '%(device)' does not support hotplugging",
+    },
     {
         .error_fmt = QERR_DUPLICATE_ID,
         .desc      = "Duplicate ID '%(id)' for %(object)",
diff --git a/qerror.h b/qerror.h
index 943a24b4e5..b0f69dabe5 100644
--- a/qerror.h
+++ b/qerror.h
@@ -90,6 +90,9 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_DEVICE_NO_BUS \
     "{ 'class': 'DeviceNoBus', 'data': { 'device': %s } }"
 
+#define QERR_DEVICE_NO_HOTPLUG \
+    "{ 'class': 'DeviceNoHotplug', 'data': { 'device': %s } }"
+
 #define QERR_DUPLICATE_ID \
     "{ 'class': 'DuplicateId', 'data': { 'id': %s, 'object': %s } }"
 

From 0965f12da61cbfe62252d21a8e6fa309753760e8 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 6 Jan 2011 15:14:38 +0100
Subject: [PATCH 04/12] piix: tag as not hotpluggable.

This patch tags all pci devices which belong to the piix3/4 chipsets as
not hotpluggable (Host bridge, ISA bridge, IDE controller, ACPI bridge).

Acked-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi_piix4.c | 2 ++
 hw/ide/piix.c   | 2 ++
 hw/piix4.c      | 1 +
 hw/piix_pci.c   | 2 ++
 4 files changed, 7 insertions(+)

diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index 173d78148d..273097d480 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -428,6 +428,8 @@ static PCIDeviceInfo piix4_pm_info = {
     .qdev.desc          = "PM",
     .qdev.size          = sizeof(PIIX4PMState),
     .qdev.vmsd          = &vmstate_acpi,
+    .qdev.no_user       = 1,
+    .no_hotplug         = 1,
     .init               = piix4_pm_initfn,
     .config_write       = pm_write_config,
     .qdev.props         = (Property[]) {
diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index 1cad9066a0..d4289af9c4 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -194,11 +194,13 @@ static PCIDeviceInfo piix_ide_info[] = {
         .qdev.name    = "piix3-ide",
         .qdev.size    = sizeof(PCIIDEState),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = pci_piix3_ide_initfn,
     },{
         .qdev.name    = "piix4-ide",
         .qdev.size    = sizeof(PCIIDEState),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = pci_piix4_ide_initfn,
     },{
         /* end of list */
diff --git a/hw/piix4.c b/hw/piix4.c
index 5489386d68..72073cd0a0 100644
--- a/hw/piix4.c
+++ b/hw/piix4.c
@@ -113,6 +113,7 @@ static PCIDeviceInfo piix4_info[] = {
         .qdev.desc    = "ISA bridge",
         .qdev.size    = sizeof(PCIDevice),
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = piix4_initfn,
     },{
         /* end of list */
diff --git a/hw/piix_pci.c b/hw/piix_pci.c
index 38f9d9eea4..358da58a80 100644
--- a/hw/piix_pci.c
+++ b/hw/piix_pci.c
@@ -348,6 +348,7 @@ static PCIDeviceInfo i440fx_info[] = {
         .qdev.size    = sizeof(PCII440FXState),
         .qdev.vmsd    = &vmstate_i440fx,
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = i440fx_initfn,
         .config_write = i440fx_write_config,
     },{
@@ -356,6 +357,7 @@ static PCIDeviceInfo i440fx_info[] = {
         .qdev.size    = sizeof(PIIX3State),
         .qdev.vmsd    = &vmstate_piix3,
         .qdev.no_user = 1,
+        .no_hotplug   = 1,
         .init         = piix3_initfn,
     },{
         /* end of list */

From be92bbf73dfd7d8a4786dc5f6c71590f4fbc5a32 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 6 Jan 2011 15:14:39 +0100
Subject: [PATCH 05/12] vga: tag as not hotplugable.

This patch tags all vga cards as not hotpluggable.  The qemu
standard vga will never ever be hotpluggable.  For cirrus + vmware
it might be possible to get that work some day.  Todays we can't
handle that for a number of reasons though.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/cirrus_vga.c | 1 +
 hw/vga-pci.c    | 1 +
 hw/vmware_vga.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
index 4f5040cf4b..0d522a0540 100644
--- a/hw/cirrus_vga.c
+++ b/hw/cirrus_vga.c
@@ -3227,6 +3227,7 @@ static PCIDeviceInfo cirrus_vga_info = {
     .qdev.desc    = "Cirrus CLGD 54xx VGA",
     .qdev.size    = sizeof(PCICirrusVGAState),
     .qdev.vmsd    = &vmstate_pci_cirrus_vga,
+    .no_hotplug   = 1,
     .init         = pci_cirrus_vga_initfn,
     .romfile      = VGABIOS_CIRRUS_FILENAME,
     .config_write = pci_cirrus_write_config,
diff --git a/hw/vga-pci.c b/hw/vga-pci.c
index 791ca22763..ce9ec45777 100644
--- a/hw/vga-pci.c
+++ b/hw/vga-pci.c
@@ -110,6 +110,7 @@ static PCIDeviceInfo vga_info = {
     .qdev.name    = "VGA",
     .qdev.size    = sizeof(PCIVGAState),
     .qdev.vmsd    = &vmstate_vga_pci,
+    .no_hotplug   = 1,
     .init         = pci_vga_initfn,
     .config_write = pci_vga_write_config,
     .romfile      = "vgabios-stdvga.bin",
diff --git a/hw/vmware_vga.c b/hw/vmware_vga.c
index d9dd52fc60..6c59053308 100644
--- a/hw/vmware_vga.c
+++ b/hw/vmware_vga.c
@@ -1318,6 +1318,7 @@ static PCIDeviceInfo vmsvga_info = {
     .qdev.name    = "vmware-svga",
     .qdev.size    = sizeof(struct pci_vmsvga_state_s),
     .qdev.vmsd    = &vmstate_vmware_vga,
+    .no_hotplug   = 1,
     .init         = pci_vmsvga_initfn,
     .romfile      = "vgabios-vmware.bin",
 };

From 2f6bfe3b0c5bb216abfe015d824eaf84c449c6a5 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 6 Jan 2011 15:14:40 +0100
Subject: [PATCH 06/12] qxl: tag as not hotpluggable

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/qxl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/qxl.c b/hw/qxl.c
index 207aa63f90..bd71e5810f 100644
--- a/hw/qxl.c
+++ b/hw/qxl.c
@@ -1546,6 +1546,7 @@ static PCIDeviceInfo qxl_info_primary = {
     .qdev.size    = sizeof(PCIQXLDevice),
     .qdev.reset   = qxl_reset_handler,
     .qdev.vmsd    = &qxl_vmstate,
+    .no_hotplug   = 1,
     .init         = qxl_init_primary,
     .config_write = qxl_write_config,
     .romfile      = "vgabios-qxl.bin",

From 3dbca8e6a7a5be52c3251ad03bf30d73301babf3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Fri, 17 Dec 2010 12:01:49 +0000
Subject: [PATCH 07/12] virtio-pci: Rename bugs field to flags

The VirtIOPCIProxy bugs field is currently used to enable workarounds
for older guests.  Rename it to flags so that other per-device behavior
can be tracked.

A later patch uses the flags field to remember whether ioeventfd should
be used for virtqueue host notification.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-pci.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 6186142b2b..13dd39194a 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -80,9 +80,8 @@
  * 12 is historical, and due to x86 page size. */
 #define VIRTIO_PCI_QUEUE_ADDR_SHIFT    12
 
-/* We can catch some guest bugs inside here so we continue supporting older
-   guests. */
-#define VIRTIO_PCI_BUG_BUS_MASTER	(1 << 0)
+/* Flags track per-device state like workarounds for quirks in older guests. */
+#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG  (1 << 0)
 
 /* QEMU doesn't strictly need write barriers since everything runs in
  * lock-step.  We'll leave the calls to wmb() in though to make it obvious for
@@ -95,7 +94,7 @@
 typedef struct {
     PCIDevice pci_dev;
     VirtIODevice *vdev;
-    uint32_t bugs;
+    uint32_t flags;
     uint32_t addr;
     uint32_t class_code;
     uint32_t nvectors;
@@ -159,7 +158,7 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f)
        in ready state. Then we have a buggy guest OS. */
     if ((proxy->vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) &&
         !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-        proxy->bugs |= VIRTIO_PCI_BUG_BUS_MASTER;
+        proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
     }
     return 0;
 }
@@ -185,7 +184,7 @@ static void virtio_pci_reset(DeviceState *d)
     VirtIOPCIProxy *proxy = container_of(d, VirtIOPCIProxy, pci_dev.qdev);
     virtio_reset(proxy->vdev);
     msix_reset(&proxy->pci_dev);
-    proxy->bugs = 0;
+    proxy->flags = 0;
 }
 
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -235,7 +234,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
            some safety checks. */
         if ((val & VIRTIO_CONFIG_S_DRIVER_OK) &&
             !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-            proxy->bugs |= VIRTIO_PCI_BUG_BUS_MASTER;
+            proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
         }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
@@ -403,7 +402,7 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
 
     if (PCI_COMMAND == address) {
         if (!(val & PCI_COMMAND_MASTER)) {
-            if (!(proxy->bugs & VIRTIO_PCI_BUG_BUS_MASTER)) {
+            if (!(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) {
                 virtio_set_status(proxy->vdev,
                                   proxy->vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
             }

From 85cf2a8d7435754f685a26f95dcb43a93a84ff60 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 10 Jan 2011 14:28:40 +0200
Subject: [PATCH 08/12] virtio: move vmstate change tracking to core

Move tracking vmstate change from virtio-net to virtio.c
as it is going to be used by virito-blk and virtio-pci
for the ioeventfd support.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-net.c | 28 +++++++---------------------
 hw/virtio.c     | 22 ++++++++++++++++++++++
 hw/virtio.h     |  3 +++
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index ec1bf8dda7..ccb3e632a4 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -54,8 +54,6 @@ typedef struct VirtIONet
     uint8_t nouni;
     uint8_t nobcast;
     uint8_t vhost_started;
-    bool vm_running;
-    VMChangeStateEntry *vmstate;
     struct {
         int in_use;
         int first_multi;
@@ -102,7 +100,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
 static bool virtio_net_started(VirtIONet *n, uint8_t status)
 {
     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
-        (n->status & VIRTIO_NET_S_LINK_UP) && n->vm_running;
+        (n->status & VIRTIO_NET_S_LINK_UP) && n->vdev.vm_running;
 }
 
 static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
@@ -453,7 +451,7 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
 static int virtio_net_can_receive(VLANClientState *nc)
 {
     VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         return 0;
     }
 
@@ -708,7 +706,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
         return num_packets;
     }
 
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     if (n->async_tx.elem.out_num) {
         virtio_queue_set_notification(n->tx_vq, 0);
@@ -769,7 +767,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
     VirtIONet *n = to_virtio_net(vdev);
 
     /* This happens when device was stopped but VCPU wasn't. */
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         n->tx_waiting = 1;
         return;
     }
@@ -796,7 +794,7 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
     }
     n->tx_waiting = 1;
     /* This happens when device was stopped but VCPU wasn't. */
-    if (!n->vm_running) {
+    if (!n->vdev.vm_running) {
         return;
     }
     virtio_queue_set_notification(vq, 0);
@@ -806,7 +804,7 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
 static void virtio_net_tx_timer(void *opaque)
 {
     VirtIONet *n = opaque;
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     n->tx_waiting = 0;
 
@@ -823,7 +821,7 @@ static void virtio_net_tx_bh(void *opaque)
     VirtIONet *n = opaque;
     int32_t ret;
 
-    assert(n->vm_running);
+    assert(n->vdev.vm_running);
 
     n->tx_waiting = 0;
 
@@ -988,16 +986,6 @@ static NetClientInfo net_virtio_info = {
     .link_status_changed = virtio_net_set_link_status,
 };
 
-static void virtio_net_vmstate_change(void *opaque, int running, int reason)
-{
-    VirtIONet *n = opaque;
-    n->vm_running = running;
-    /* This is called when vm is started/stopped,
-     * it will start/stop vhost backend if appropriate
-     * e.g. after migration. */
-    virtio_net_set_status(&n->vdev, n->vdev.status);
-}
-
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
                               virtio_net_conf *net)
 {
@@ -1052,7 +1040,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
     n->qdev = dev;
     register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION,
                     virtio_net_save, virtio_net_load, n);
-    n->vmstate = qemu_add_vm_change_state_handler(virtio_net_vmstate_change, n);
 
     add_boot_device_path(conf->bootindex, dev, "/ethernet-phy@0");
 
@@ -1062,7 +1049,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
 void virtio_net_exit(VirtIODevice *vdev)
 {
     VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
-    qemu_del_vm_change_state_handler(n->vmstate);
 
     /* This will stop vhost backend if appropriate. */
     virtio_net_set_status(vdev, 0);
diff --git a/hw/virtio.c b/hw/virtio.c
index 07dbf868fd..1d20be2ca3 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -743,11 +743,31 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f)
 
 void virtio_cleanup(VirtIODevice *vdev)
 {
+    qemu_del_vm_change_state_handler(vdev->vmstate);
     if (vdev->config)
         qemu_free(vdev->config);
     qemu_free(vdev->vq);
 }
 
+static void virtio_vmstate_change(void *opaque, int running, int reason)
+{
+    VirtIODevice *vdev = opaque;
+    bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
+    vdev->vm_running = running;
+
+    if (backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+
+    if (vdev->binding->vmstate_change) {
+        vdev->binding->vmstate_change(vdev->binding_opaque, backend_run);
+    }
+
+    if (!backend_run) {
+        virtio_set_status(vdev, vdev->status);
+    }
+}
+
 VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
                                  size_t config_size, size_t struct_size)
 {
@@ -774,6 +794,8 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
     else
         vdev->config = NULL;
 
+    vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev);
+
     return vdev;
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 02fa312d3e..bd5274225e 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -95,6 +95,7 @@ typedef struct {
     unsigned (*get_features)(void * opaque);
     int (*set_guest_notifiers)(void * opaque, bool assigned);
     int (*set_host_notifier)(void * opaque, int n, bool assigned);
+    void (*vmstate_change)(void * opaque, bool running);
 } VirtIOBindings;
 
 #define VIRTIO_PCI_QUEUE_MAX 64
@@ -123,6 +124,8 @@ struct VirtIODevice
     const VirtIOBindings *binding;
     void *binding_opaque;
     uint16_t device_id;
+    bool vm_running;
+    VMChangeStateEntry *vmstate;
 };
 
 static inline void virtio_set_status(VirtIODevice *vdev, uint8_t val)

From d2f2b8a740c82319f9eea51ebed50815fbc3da3e Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 10 Jan 2011 13:50:05 +0200
Subject: [PATCH 09/12] kvm: test for ioeventfd support on old kernels

There used to be a limit of 6 KVM io bus devices in the kernel.
On such a kernel, we can't use many ioeventfds for host notification
since the limit is reached too easily.

Add an API to test for this condition.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 kvm-all.c  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 kvm-stub.c |  5 +++++
 kvm.h      |  1 +
 3 files changed, 55 insertions(+)

diff --git a/kvm-all.c b/kvm-all.c
index cae24bb87c..255b6fad9c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -28,6 +28,11 @@
 #include "kvm.h"
 #include "bswap.h"
 
+/* This check must be after config-host.h is included */
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
 /* KVM uses PAGE_SIZE in it's definition of COALESCED_MMIO_MAX */
 #define PAGE_SIZE TARGET_PAGE_SIZE
 
@@ -72,6 +77,7 @@ struct KVMState
     int irqchip_in_kernel;
     int pit_in_kernel;
     int xsave, xcrs;
+    int many_ioeventfds;
 };
 
 static KVMState *kvm_state;
@@ -441,6 +447,39 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+static int kvm_check_many_ioeventfds(void)
+{
+    /* Older kernels have a 6 device limit on the KVM io bus.  Find out so we
+     * can avoid creating too many ioeventfds.
+     */
+#ifdef CONFIG_EVENTFD
+    int ioeventfds[7];
+    int i, ret = 0;
+    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
+        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
+        if (ioeventfds[i] < 0) {
+            break;
+        }
+        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
+        if (ret < 0) {
+            close(ioeventfds[i]);
+            break;
+        }
+    }
+
+    /* Decide whether many devices are supported or not */
+    ret = i == ARRAY_SIZE(ioeventfds);
+
+    while (i-- > 0) {
+        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
+        close(ioeventfds[i]);
+    }
+    return ret;
+#else
+    return 0;
+#endif
+}
+
 static void kvm_set_phys_mem(target_phys_addr_t start_addr,
 			     ram_addr_t size,
 			     ram_addr_t phys_offset)
@@ -717,6 +756,8 @@ int kvm_init(int smp_cpus)
     kvm_state = s;
     cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
 
+    s->many_ioeventfds = kvm_check_many_ioeventfds();
+
     return 0;
 
 err:
@@ -1046,6 +1087,14 @@ int kvm_has_xcrs(void)
     return kvm_state->xcrs;
 }
 
+int kvm_has_many_ioeventfds(void)
+{
+    if (!kvm_enabled()) {
+        return 0;
+    }
+    return kvm_state->many_ioeventfds;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
     if (!kvm_has_sync_mmu()) {
diff --git a/kvm-stub.c b/kvm-stub.c
index 5384a4b9a4..33d4476fa3 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -99,6 +99,11 @@ int kvm_has_robust_singlestep(void)
     return 0;
 }
 
+int kvm_has_many_ioeventfds(void)
+{
+    return 0;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
 }
diff --git a/kvm.h b/kvm.h
index 60a9b425c8..ce08d42756 100644
--- a/kvm.h
+++ b/kvm.h
@@ -42,6 +42,7 @@ int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
 int kvm_has_xsave(void);
 int kvm_has_xcrs(void);
+int kvm_has_many_ioeventfds(void);
 
 #ifdef NEED_CPU_H
 int kvm_init_vcpu(CPUState *env);

From 25db9ebe15125deb32958c6df74996f745edf1f9 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Fri, 17 Dec 2010 12:01:50 +0000
Subject: [PATCH 10/12] virtio-pci: Use ioeventfd for virtqueue notify

Virtqueue notify is currently handled synchronously in userspace virtio.  This
prevents the vcpu from executing guest code while hardware emulation code
handles the notify.

On systems that support KVM, the ioeventfd mechanism can be used to make
virtqueue notify a lightweight exit by deferring hardware emulation to the
iothread and allowing the VM to continue execution.  This model is similar to
how vhost receives virtqueue notifies.

The result of this change is improved performance for userspace virtio devices.
Virtio-blk throughput increases especially for multithreaded scenarios and
virtio-net transmit throughput increases substantially.

Some virtio devices are known to have guest drivers which expect a notify to be
processed synchronously and spin waiting for completion.
For virtio-net, this also seems to interact with the guest stack in strange
ways so that TCP throughput for small message sizes (~200bytes)
is harmed. Only enable ioeventfd for virtio-blk for now.

Care must be taken not to interfere with vhost-net, which uses host
notifiers.  If the set_host_notifier() API is used by a device
virtio-pci will disable virtio-ioeventfd and let the device deal with
host notifiers as it wishes.

Finally, there used to be a limit of 6 KVM io bus devices inside the
kernel.  On such a kernel, don't use ioeventfd for virtqueue host
notification since the limit is reached too easily.  This ensures that
existing vhost-net setups (which always use ioeventfd) have ioeventfds
available so they can continue to work.

After migration and on VM change state (running/paused) virtio-ioeventfd
will enable/disable itself.

 * VIRTIO_CONFIG_S_DRIVER_OK -> enable virtio-ioeventfd
 * !VIRTIO_CONFIG_S_DRIVER_OK -> disable virtio-ioeventfd
 * virtio_pci_set_host_notifier() -> disable virtio-ioeventfd
 * vm_change_state(running=0) -> disable virtio-ioeventfd
 * vm_change_state(running=1) -> enable virtio-ioeventfd

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-pci.c | 196 ++++++++++++++++++++++++++++++++++++++++++------
 hw/virtio.c     |  14 +++-
 hw/virtio.h     |   1 +
 3 files changed, 185 insertions(+), 26 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 13dd39194a..70c40eeb56 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -83,6 +83,11 @@
 /* Flags track per-device state like workarounds for quirks in older guests. */
 #define VIRTIO_PCI_FLAG_BUS_MASTER_BUG  (1 << 0)
 
+/* Performance improves when virtqueue kick processing is decoupled from the
+ * vcpu thread using ioeventfd for some devices. */
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1
+#define VIRTIO_PCI_FLAG_USE_IOEVENTFD   (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
+
 /* QEMU doesn't strictly need write barriers since everything runs in
  * lock-step.  We'll leave the calls to wmb() in though to make it obvious for
  * KVM or if kqemu gets SMP support.
@@ -107,6 +112,8 @@ typedef struct {
     /* Max. number of ports we can have for a the virtio-serial device */
     uint32_t max_virtserial_ports;
     virtio_net_conf net;
+    bool ioeventfd_disabled;
+    bool ioeventfd_started;
 } VirtIOPCIProxy;
 
 /* virtio device */
@@ -179,12 +186,132 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
     return 0;
 }
 
+static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
+                                                 int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    int r;
+    if (assign) {
+        r = event_notifier_init(notifier, 1);
+        if (r < 0) {
+            return r;
+        }
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            event_notifier_cleanup(notifier);
+        }
+    } else {
+        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
+                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                                       n, assign);
+        if (r < 0) {
+            return r;
+        }
+
+        /* Handle the race condition where the guest kicked and we deassigned
+         * before we got around to handling the kick.
+         */
+        if (event_notifier_test_and_clear(notifier)) {
+            virtio_queue_notify_vq(vq);
+        }
+
+        event_notifier_cleanup(notifier);
+    }
+    return r;
+}
+
+static void virtio_pci_host_notifier_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    EventNotifier *n = virtio_queue_get_host_notifier(vq);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_queue_notify_vq(vq);
+    }
+}
+
+static void virtio_pci_set_host_notifier_fd_handler(VirtIOPCIProxy *proxy,
+                                                    int n, bool assign)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
+    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+    if (assign) {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            virtio_pci_host_notifier_read, NULL, vq);
+    } else {
+        qemu_set_fd_handler(event_notifier_get_fd(notifier),
+                            NULL, NULL, NULL);
+    }
+}
+
+static int virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int n, r;
+
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) ||
+        proxy->ioeventfd_disabled ||
+        proxy->ioeventfd_started) {
+        return 0;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        r = virtio_pci_set_host_notifier_internal(proxy, n, true);
+        if (r < 0) {
+            goto assign_error;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, true);
+    }
+    proxy->ioeventfd_started = true;
+    return 0;
+
+assign_error:
+    while (--n >= 0) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        virtio_pci_set_host_notifier_internal(proxy, n, false);
+    }
+    proxy->ioeventfd_started = false;
+    proxy->ioeventfd_disabled = true;
+    return r;
+}
+
+static int virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
+{
+    int n;
+
+    if (!proxy->ioeventfd_started) {
+        return 0;
+    }
+
+    for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
+        if (!virtio_queue_get_num(proxy->vdev, n)) {
+            continue;
+        }
+
+        virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
+        virtio_pci_set_host_notifier_internal(proxy, n, false);
+    }
+    proxy->ioeventfd_started = false;
+    return 0;
+}
+
 static void virtio_pci_reset(DeviceState *d)
 {
     VirtIOPCIProxy *proxy = container_of(d, VirtIOPCIProxy, pci_dev.qdev);
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_reset(proxy->vdev);
     msix_reset(&proxy->pci_dev);
-    proxy->flags = 0;
+    proxy->flags &= ~VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
 }
 
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -209,6 +336,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
     case VIRTIO_PCI_QUEUE_PFN:
         pa = (target_phys_addr_t)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
         if (pa == 0) {
+            virtio_pci_stop_ioeventfd(proxy);
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
         }
@@ -223,7 +351,16 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         virtio_queue_notify(vdev, val);
         break;
     case VIRTIO_PCI_STATUS:
+        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+            virtio_pci_stop_ioeventfd(proxy);
+        }
+
         virtio_set_status(vdev, val & 0xFF);
+
+        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+            virtio_pci_start_ioeventfd(proxy);
+        }
+
         if (vdev->status == 0) {
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
@@ -403,6 +540,7 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
     if (PCI_COMMAND == address) {
         if (!(val & PCI_COMMAND_MASTER)) {
             if (!(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) {
+                virtio_pci_stop_ioeventfd(proxy);
                 virtio_set_status(proxy->vdev,
                                   proxy->vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
             }
@@ -480,30 +618,30 @@ assign_error:
 static int virtio_pci_set_host_notifier(void *opaque, int n, bool assign)
 {
     VirtIOPCIProxy *proxy = opaque;
-    VirtQueue *vq = virtio_get_queue(proxy->vdev, n);
-    EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
-    int r;
+
+    /* Stop using ioeventfd for virtqueue kick if the device starts using host
+     * notifiers.  This makes it easy to avoid stepping on each others' toes.
+     */
+    proxy->ioeventfd_disabled = assign;
     if (assign) {
-        r = event_notifier_init(notifier, 1);
-        if (r < 0) {
-            return r;
-        }
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            event_notifier_cleanup(notifier);
-        }
-    } else {
-        r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
-                                       proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
-                                       n, assign);
-        if (r < 0) {
-            return r;
-        }
-        event_notifier_cleanup(notifier);
+        virtio_pci_stop_ioeventfd(proxy);
+    }
+    /* We don't need to start here: it's not needed because backend
+     * currently only stops on status change away from ok,
+     * reset, vmstop and such. If we do add code to start here,
+     * need to check vmstate, device state etc. */
+    return virtio_pci_set_host_notifier_internal(proxy, n, assign);
+}
+
+static void virtio_pci_vmstate_change(void *opaque, bool running)
+{
+    VirtIOPCIProxy *proxy = opaque;
+
+    if (running) {
+        virtio_pci_start_ioeventfd(proxy);
+    } else {
+        virtio_pci_stop_ioeventfd(proxy);
     }
-    return r;
 }
 
 static const VirtIOBindings virtio_pci_bindings = {
@@ -515,6 +653,7 @@ static const VirtIOBindings virtio_pci_bindings = {
     .get_features = virtio_pci_get_features,
     .set_host_notifier = virtio_pci_set_host_notifier,
     .set_guest_notifiers = virtio_pci_set_guest_notifiers,
+    .vmstate_change = virtio_pci_vmstate_change,
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
@@ -559,10 +698,15 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
     pci_register_bar(&proxy->pci_dev, 0, size, PCI_BASE_ADDRESS_SPACE_IO,
                            virtio_map);
 
+    if (!kvm_has_many_ioeventfds()) {
+        proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+    }
+
     virtio_bind_device(vdev, &virtio_pci_bindings, proxy);
     proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY;
     proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE;
     proxy->host_features = vdev->get_features(vdev, proxy->host_features);
+
 }
 
 static int virtio_blk_init_pci(PCIDevice *pci_dev)
@@ -597,6 +741,7 @@ static int virtio_blk_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_blk_exit(proxy->vdev);
     blockdev_mark_auto_del(proxy->block.bs);
     return virtio_exit_pci(pci_dev);
@@ -658,6 +803,7 @@ static int virtio_net_exit_pci(PCIDevice *pci_dev)
 {
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
 
+    virtio_pci_stop_ioeventfd(proxy);
     virtio_net_exit(proxy->vdev);
     return virtio_exit_pci(pci_dev);
 }
@@ -705,6 +851,8 @@ static PCIDeviceInfo virtio_info[] = {
         .qdev.props = (Property[]) {
             DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0),
             DEFINE_BLOCK_PROPERTIES(VirtIOPCIProxy, block),
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
             DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_PROP_END_OF_LIST(),
@@ -717,6 +865,8 @@ static PCIDeviceInfo virtio_info[] = {
         .exit       = virtio_net_exit_pci,
         .romfile    = "pxe-virtio.bin",
         .qdev.props = (Property[]) {
+            DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                            VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, false),
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
             DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic),
diff --git a/hw/virtio.c b/hw/virtio.c
index 1d20be2ca3..31bd9e32dc 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -575,11 +575,19 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
     return vdev->vq[n].vring.num;
 }
 
+void virtio_queue_notify_vq(VirtQueue *vq)
+{
+    if (vq->vring.desc) {
+        VirtIODevice *vdev = vq->vdev;
+        trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+        vq->handle_output(vdev, vq);
+    }
+}
+
 void virtio_queue_notify(VirtIODevice *vdev, int n)
 {
-    if (n < VIRTIO_PCI_QUEUE_MAX && vdev->vq[n].vring.desc) {
-        trace_virtio_queue_notify(vdev, n, &vdev->vq[n]);
-        vdev->vq[n].handle_output(vdev, &vdev->vq[n]);
+    if (n < VIRTIO_PCI_QUEUE_MAX) {
+        virtio_queue_notify_vq(&vdev->vq[n]);
     }
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index bd5274225e..d8546d5b30 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -222,5 +222,6 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx);
 VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n);
 EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
+void virtio_queue_notify_vq(VirtQueue *vq);
 void virtio_irq(VirtQueue *vq);
 #endif

From 65d6dcbde84314c6d05a365a26a384f880a5c8fe Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Fri, 17 Dec 2010 12:01:52 +0000
Subject: [PATCH 11/12] docs: Document virtio PCI -device ioeventfd=on|off

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 docs/qdev-device-use.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/qdev-device-use.txt b/docs/qdev-device-use.txt
index f252c8e3bc..f2f9b757a5 100644
--- a/docs/qdev-device-use.txt
+++ b/docs/qdev-device-use.txt
@@ -97,10 +97,13 @@ The -device argument differs in detail for each kind of drive:
 
 * if=virtio
 
-  -device virtio-blk-pci,drive=DRIVE-ID,class=C,vectors=V
+  -device virtio-blk-pci,drive=DRIVE-ID,class=C,vectors=V,ioeventfd=IOEVENTFD
 
   This lets you control PCI device class and MSI-X vectors.
 
+  IOEVENTFD controls whether or not ioeventfd is used for virtqueue notify.  It
+  can be set to on (default) or off.
+
   As for all PCI devices, you can add bus=PCI-BUS,addr=DEVFN to
   control the PCI device address.
 
@@ -240,6 +243,9 @@ For PCI devices, you can add bus=PCI-BUS,addr=DEVFN to control the PCI
 device address, as usual.  The old -net nic provides parameter addr
 for that, it is silently ignored when the NIC is not a PCI device.
 
+For virtio-net-pci, you can control whether or not ioeventfd is used for
+virtqueue notify by setting ioeventfd= to on or off (default).
+
 -net nic accepts vectors=V for all models, but it's silently ignored
 except for virtio-net-pci (model=virtio).  With -device, only devices
 that support it accept it.

From b36e391441906c36ed0856b69de84001860402bf Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 11 Jan 2011 14:10:15 +0200
Subject: [PATCH 12/12] ioeventfd: error handling cleanup

- Don't return status from start/stop functions where it's ignored
- report errors to make debugging easier
- assert on unexpected failures
- don't disable notifiers on error so that we'll
  retry when guest driver restarts

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 hw/virtio-pci.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 70c40eeb56..d07ff976be 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -195,12 +195,16 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
     if (assign) {
         r = event_notifier_init(notifier, 1);
         if (r < 0) {
+            error_report("%s: unable to init event notifier: %d",
+                         __func__, r);
             return r;
         }
         r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier),
                                        proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
                                        n, assign);
         if (r < 0) {
+            error_report("%s: unable to map ioeventfd: %d",
+                         __func__, r);
             event_notifier_cleanup(notifier);
         }
     } else {
@@ -208,6 +212,8 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
                                        proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
                                        n, assign);
         if (r < 0) {
+            error_report("%s: unable to unmap ioeventfd: %d",
+                         __func__, r);
             return r;
         }
 
@@ -246,14 +252,14 @@ static void virtio_pci_set_host_notifier_fd_handler(VirtIOPCIProxy *proxy,
     }
 }
 
-static int virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
+static void virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
 {
     int n, r;
 
     if (!(proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) ||
         proxy->ioeventfd_disabled ||
         proxy->ioeventfd_started) {
-        return 0;
+        return;
     }
 
     for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
@@ -269,7 +275,7 @@ static int virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
         virtio_pci_set_host_notifier_fd_handler(proxy, n, true);
     }
     proxy->ioeventfd_started = true;
-    return 0;
+    return;
 
 assign_error:
     while (--n >= 0) {
@@ -278,19 +284,20 @@ assign_error:
         }
 
         virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
-        virtio_pci_set_host_notifier_internal(proxy, n, false);
+        r = virtio_pci_set_host_notifier_internal(proxy, n, false);
+        assert(r >= 0);
     }
     proxy->ioeventfd_started = false;
-    proxy->ioeventfd_disabled = true;
-    return r;
+    error_report("%s: failed. Fallback to a userspace (slower).", __func__);
 }
 
-static int virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
+static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
 {
+    int r;
     int n;
 
     if (!proxy->ioeventfd_started) {
-        return 0;
+        return;
     }
 
     for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
@@ -299,10 +306,10 @@ static int virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
         }
 
         virtio_pci_set_host_notifier_fd_handler(proxy, n, false);
-        virtio_pci_set_host_notifier_internal(proxy, n, false);
+        r = virtio_pci_set_host_notifier_internal(proxy, n, false);
+        assert(r >= 0);
     }
     proxy->ioeventfd_started = false;
-    return 0;
 }
 
 static void virtio_pci_reset(DeviceState *d)
@@ -706,7 +713,6 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
     proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY;
     proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE;
     proxy->host_features = vdev->get_features(vdev, proxy->host_features);
-
 }
 
 static int virtio_blk_init_pci(PCIDevice *pci_dev)