From 9035f8c09bebb63c0cc6014acf5c7066ef778aff Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 30 Jun 2014 09:50:33 -0600
Subject: [PATCH 1/4] vfio-pci: Fix MSI/X debug code

Use the correct MSI message function for debug info.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/misc/vfio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 7b279c4f05..4975ccf22c 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -642,9 +642,9 @@ static void vfio_msi_interrupt(void *opaque)
     MSIMessage msg;
 
     if (vdev->interrupt == VFIO_INT_MSIX) {
-        msg = msi_get_message(&vdev->pdev, nr);
-    } else if (vdev->interrupt == VFIO_INT_MSI) {
         msg = msix_get_message(&vdev->pdev, nr);
+    } else if (vdev->interrupt == VFIO_INT_MSI) {
+        msg = msi_get_message(&vdev->pdev, nr);
     } else {
         abort();
     }

From f4d45d47826377722700894dbf7f47444527a9d2 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 30 Jun 2014 09:50:33 -0600
Subject: [PATCH 2/4] vfio-pci: Fix MSI-X masking performance

There are still old guests out there that over-exercise MSI-X masking.
The current code completely sets up and tears down an MSI-X vector on
the "use" and "release" callbacks.  While this is functional, it can
slow an old guest to a crawl.  We can easily skip the KVM parts of
this so that we keep the MSI route and irqfd setup.  We do, however,
need to switch VFIO to trigger a different eventfd while masked.
Actually, we have the option of continuing to use -1 to disable the
trigger, but by using another EventNotifier we can allow the MSI-X
core to emulate pending bits and re-fire the vector once unmasked.
MSI code gets updated as well to use the same setup and teardown
structures and functions.

Prior to this change, an igbvf assigned to a RHEL5 guest gets about
20Mbps and 50 transactions/s with netperf (remote or VF->PF).  With
this change, we get line rate and 3k transactions/s remote or 2Gbps
and 6k+ transactions/s to the PF.  No significant change is expected
for newer guests with better behaved MSI-X support.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/misc/vfio.c | 233 +++++++++++++++++++++++++++----------------------
 1 file changed, 131 insertions(+), 102 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 4975ccf22c..7312ce20f2 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -121,6 +121,7 @@ typedef struct VFIOINTx {
 
 typedef struct VFIOMSIVector {
     EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier kvm_interrupt; /* eventfd triggered for KVM irqfd bypass */
     struct VFIODevice *vdev; /* back pointer to device */
     MSIMessage msg; /* cache the MSI message so we know when it changes */
     int virq; /* KVM irqchip route for QEMU bypass */
@@ -682,10 +683,11 @@ static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
     for (i = 0; i < vdev->nr_vectors; i++) {
         if (!vdev->msi_vectors[i].use) {
             fds[i] = -1;
-            continue;
+        } else if (vdev->msi_vectors[i].virq >= 0) {
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
+        } else {
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
         }
-
-        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
     }
 
     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
@@ -695,6 +697,52 @@ static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
     return ret;
 }
 
+static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
+                                  bool msix)
+{
+    int virq;
+
+    if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
+        (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
+        return;
+    }
+
+    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
+        return;
+    }
+
+    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
+    if (virq < 0) {
+        event_notifier_cleanup(&vector->kvm_interrupt);
+        return;
+    }
+
+    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
+                                       NULL, virq) < 0) {
+        kvm_irqchip_release_virq(kvm_state, virq);
+        event_notifier_cleanup(&vector->kvm_interrupt);
+        return;
+    }
+
+    vector->msg = *msg;
+    vector->virq = virq;
+}
+
+static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
+{
+    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
+                                      vector->virq);
+    kvm_irqchip_release_virq(kvm_state, vector->virq);
+    vector->virq = -1;
+    event_notifier_cleanup(&vector->kvm_interrupt);
+}
+
+static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
+{
+    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
+    vector->msg = msg;
+}
+
 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                    MSIMessage *msg, IOHandler *handler)
 {
@@ -707,30 +755,32 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
             vdev->host.function, nr);
 
     vector = &vdev->msi_vectors[nr];
-    vector->vdev = vdev;
-    vector->use = true;
 
-    msix_vector_use(pdev, nr);
-
-    if (event_notifier_init(&vector->interrupt, 0)) {
-        error_report("vfio: Error: event_notifier_init failed");
+    if (!vector->use) {
+        vector->vdev = vdev;
+        vector->virq = -1;
+        if (event_notifier_init(&vector->interrupt, 0)) {
+            error_report("vfio: Error: event_notifier_init failed");
+        }
+        vector->use = true;
+        msix_vector_use(pdev, nr);
     }
 
+    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                        handler, NULL, vector);
+
     /*
      * Attempt to enable route through KVM irqchip,
      * default to userspace handling if unavailable.
      */
-    vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
-                   kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
-    if (vector->virq < 0 ||
-        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
-                                       NULL, vector->virq) < 0) {
-        if (vector->virq >= 0) {
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
-            vector->virq = -1;
+    if (vector->virq >= 0) {
+        if (!msg) {
+            vfio_remove_kvm_msi_virq(vector);
+        } else {
+            vfio_update_kvm_msi_virq(vector, *msg);
         }
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                            handler, NULL, vector);
+    } else {
+        vfio_add_kvm_msi_virq(vector, msg, true);
     }
 
     /*
@@ -761,7 +811,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
         irq_set->count = 1;
         pfd = (int32_t *)&irq_set->data;
 
-        *pfd = event_notifier_get_fd(&vector->interrupt);
+        if (vector->virq >= 0) {
+            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
+        } else {
+            *pfd = event_notifier_get_fd(&vector->interrupt);
+        }
 
         ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
         g_free(irq_set);
@@ -783,50 +837,41 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 {
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
-    int argsz;
-    struct vfio_irq_set *irq_set;
-    int32_t *pfd;
 
     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);
 
     /*
-     * XXX What's the right thing to do here?  This turns off the interrupt
-     * completely, but do we really just want to switch the interrupt to
-     * bouncing through userspace and let msix.c drop it?  Not sure.
+     * There are still old guests that mask and unmask vectors on every
+     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
+     * the KVM setup in place, simply switch VFIO to use the non-bypass
+     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
+     * core will mask the interrupt and set pending bits, allowing it to
+     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
      */
-    msix_vector_unuse(pdev, nr);
+    if (vector->virq >= 0) {
+        int argsz;
+        struct vfio_irq_set *irq_set;
+        int32_t *pfd;
 
-    argsz = sizeof(*irq_set) + sizeof(*pfd);
+        argsz = sizeof(*irq_set) + sizeof(*pfd);
 
-    irq_set = g_malloc0(argsz);
-    irq_set->argsz = argsz;
-    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
-                     VFIO_IRQ_SET_ACTION_TRIGGER;
-    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-    irq_set->start = nr;
-    irq_set->count = 1;
-    pfd = (int32_t *)&irq_set->data;
+        irq_set = g_malloc0(argsz);
+        irq_set->argsz = argsz;
+        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                         VFIO_IRQ_SET_ACTION_TRIGGER;
+        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+        irq_set->start = nr;
+        irq_set->count = 1;
+        pfd = (int32_t *)&irq_set->data;
 
-    *pfd = -1;
+        *pfd = event_notifier_get_fd(&vector->interrupt);
 
-    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+        ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
-    g_free(irq_set);
-
-    if (vector->virq < 0) {
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                            NULL, NULL, NULL);
-    } else {
-        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
-                                          vector->virq);
-        kvm_irqchip_release_virq(kvm_state, vector->virq);
-        vector->virq = -1;
+        g_free(irq_set);
     }
-
-    event_notifier_cleanup(&vector->interrupt);
-    vector->use = false;
 }
 
 static void vfio_enable_msix(VFIODevice *vdev)
@@ -876,28 +921,28 @@ retry:
         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 
         vector->vdev = vdev;
+        vector->virq = -1;
         vector->use = true;
 
         if (event_notifier_init(&vector->interrupt, 0)) {
             error_report("vfio: Error: event_notifier_init failed");
         }
 
+        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                            vfio_msi_interrupt, NULL, vector);
+
         vector->msg = msi_get_message(&vdev->pdev, i);
 
         /*
          * Attempt to enable route through KVM irqchip,
          * default to userspace handling if unavailable.
          */
-        vector->virq = VFIO_ALLOW_KVM_MSI ?
-                       kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
-        if (vector->virq < 0 ||
-            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
-                                           NULL, vector->virq) < 0) {
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                vfio_msi_interrupt, NULL, vector);
-        }
+        vfio_add_kvm_msi_virq(vector, &vector->msg, false);
     }
 
+    /* Set interrupt type prior to possible interrupts */
+    vdev->interrupt = VFIO_INT_MSI;
+
     ret = vfio_enable_vectors(vdev, false);
     if (ret) {
         if (ret < 0) {
@@ -910,14 +955,10 @@ retry:
         for (i = 0; i < vdev->nr_vectors; i++) {
             VFIOMSIVector *vector = &vdev->msi_vectors[i];
             if (vector->virq >= 0) {
-                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
-                                                  vector->virq);
-                kvm_irqchip_release_virq(kvm_state, vector->virq);
-                vector->virq = -1;
-            } else {
-                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                    NULL, NULL, NULL);
+                vfio_remove_kvm_msi_virq(vector);
             }
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                NULL, NULL, NULL);
             event_notifier_cleanup(&vector->interrupt);
         }
 
@@ -929,11 +970,17 @@ retry:
         }
         vdev->nr_vectors = 0;
 
+        /*
+         * Failing to setup MSI doesn't really fall within any specification.
+         * Let's try leaving interrupts disabled and hope the guest figures
+         * out to fall back to INTx for this device.
+         */
+        error_report("vfio: Error: Failed to enable MSI");
+        vdev->interrupt = VFIO_INT_NONE;
+
         return;
     }
 
-    vdev->interrupt = VFIO_INT_MSI;
-
     DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, vdev->nr_vectors);
@@ -941,6 +988,20 @@ retry:
 
 static void vfio_disable_msi_common(VFIODevice *vdev)
 {
+    int i;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+        if (vdev->msi_vectors[i].use) {
+            if (vector->virq >= 0) {
+                vfio_remove_kvm_msi_virq(vector);
+            }
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                NULL, NULL, NULL);
+            event_notifier_cleanup(&vector->interrupt);
+        }
+    }
+
     g_free(vdev->msi_vectors);
     vdev->msi_vectors = NULL;
     vdev->nr_vectors = 0;
@@ -962,6 +1023,7 @@ static void vfio_disable_msix(VFIODevice *vdev)
     for (i = 0; i < vdev->nr_vectors; i++) {
         if (vdev->msi_vectors[i].use) {
             vfio_msix_vector_release(&vdev->pdev, i);
+            msix_vector_unuse(&vdev->pdev, i);
         }
     }
 
@@ -977,30 +1039,7 @@ static void vfio_disable_msix(VFIODevice *vdev)
 
 static void vfio_disable_msi(VFIODevice *vdev)
 {
-    int i;
-
     vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
-
-    for (i = 0; i < vdev->nr_vectors; i++) {
-        VFIOMSIVector *vector = &vdev->msi_vectors[i];
-
-        if (!vector->use) {
-            continue;
-        }
-
-        if (vector->virq >= 0) {
-            kvm_irqchip_remove_irqfd_notifier(kvm_state,
-                                              &vector->interrupt, vector->virq);
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
-            vector->virq = -1;
-        } else {
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                NULL, NULL, NULL);
-        }
-
-        event_notifier_cleanup(&vector->interrupt);
-    }
-
     vfio_disable_msi_common(vdev);
 
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
@@ -1020,17 +1059,7 @@ static void vfio_update_msi(VFIODevice *vdev)
         }
 
         msg = msi_get_message(&vdev->pdev, i);
-
-        if (msg.address != vector->msg.address ||
-            msg.data != vector->msg.data) {
-
-            DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
-                    __func__, vdev->host.domain, vdev->host.bus,
-                    vdev->host.slot, vdev->host.function, i);
-
-            kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
-            vector->msg = msg;
-        }
+        vfio_update_kvm_msi_virq(vector, msg);
     }
 }
 

From c40708176a6b52b73bec14796b7c71b882ceb102 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Mon, 30 Jun 2014 09:52:58 -0600
Subject: [PATCH 3/4] vfio: Make BARs native endian

The slow BAR access path is used when VFIO fails to mmap() a BAR.
Since this is just a transport between the guest and a device, there is
no need to do endianness swapping.

This changes BARs to use native endianness. Since non-ROM BARs were
doing byte swapping, that swapping has to be removed, which this patch
also does. As a result, this eliminates cancelling byte swaps and there
is no change in behavior for non-ROM BARs.

ROM BARs were declared little endian too, but byte swapping was not
implemented for them, so they never actually worked on big endian systems
as there was no cancelling byte swap. This fixes endianness for ROM BARs
by declaring them native endian and only fixing access sizes, as is
done for non-ROM BARs.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/misc/vfio.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 7312ce20f2..d32678e2fe 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1082,10 +1082,10 @@ static void vfio_bar_write(void *opaque, hwaddr addr,
         buf.byte = data;
         break;
     case 2:
-        buf.word = cpu_to_le16(data);
+        buf.word = data;
         break;
     case 4:
-        buf.dword = cpu_to_le32(data);
+        buf.dword = data;
         break;
     default:
         hw_error("vfio: unsupported write size, %d bytes", size);
@@ -1142,10 +1142,10 @@ static uint64_t vfio_bar_read(void *opaque,
         data = buf.byte;
         break;
     case 2:
-        data = le16_to_cpu(buf.word);
+        data = buf.word;
         break;
     case 4:
-        data = le32_to_cpu(buf.dword);
+        data = buf.dword;
         break;
     default:
         hw_error("vfio: unsupported read size, %d bytes", size);
@@ -1172,7 +1172,7 @@ static uint64_t vfio_bar_read(void *opaque,
 static const MemoryRegionOps vfio_bar_ops = {
     .read = vfio_bar_read,
     .write = vfio_bar_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
+    .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
 static void vfio_pci_load_rom(VFIODevice *vdev)
@@ -1234,21 +1234,42 @@ static void vfio_pci_load_rom(VFIODevice *vdev)
 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 {
     VFIODevice *vdev = opaque;
-    uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+    uint64_t data = 0;
 
     /* Load the ROM lazily when the guest tries to read it */
     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
         vfio_pci_load_rom(vdev);
     }
 
-    memcpy(&val, vdev->rom + addr,
+    memcpy(&buf, vdev->rom + addr,
            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
 
+    switch (size) {
+    case 1:
+        data = buf.byte;
+        break;
+    case 2:
+        data = buf.word;
+        break;
+    case 4:
+        data = buf.dword;
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes", size);
+        break;
+    }
+
     DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
             __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
-            vdev->host.function, addr, size, val);
+            vdev->host.function, addr, size, data);
 
-    return val;
+    return data;
 }
 
 static void vfio_rom_write(void *opaque, hwaddr addr,
@@ -1259,7 +1280,7 @@ static void vfio_rom_write(void *opaque, hwaddr addr,
 static const MemoryRegionOps vfio_rom_ops = {
     .read = vfio_rom_read,
     .write = vfio_rom_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
+    .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
 static bool vfio_blacklist_opt_rom(VFIODevice *vdev)

From ba29776fd8160a5c1c1892af5e237fc37aec3cf7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 30 Jun 2014 09:56:08 -0600
Subject: [PATCH 4/4] vfio: use correct runstate

io-error is for block device errors; it should always be preceded
by a BLOCK_IO_ERROR event.  I think vfio wants to use
RUN_STATE_INTERNAL_ERROR instead.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/misc/vfio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index d32678e2fe..aef4c9ce9d 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -4062,7 +4062,7 @@ static void vfio_err_notifier_handler(void *opaque)
                  __func__, vdev->host.domain, vdev->host.bus,
                  vdev->host.slot, vdev->host.function);
 
-    vm_stop(RUN_STATE_IO_ERROR);
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
 }
 
 /*