From 191f90cbea08122b48107f1f9116106fbf3bdfac Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 14 May 2020 11:14:39 -0400
Subject: [PATCH 01/58] msix: allow qword MSI-X table accesses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCI spec says:

For all accesses to MSI-X Table and MSI-X PBA fields, software must use
aligned full DWORD or aligned full QWORD transactions; otherwise, the
result is undefined.

However, since MSI-X was converted to use memory API, QEMU
started blocking qword transactions, only allowing DWORD
ones. Guests do not seem to use QWORD accesses, but let's
be spec compliant.

Fixes: 95524ae8dc8f ("msix: convert to memory API")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
---
 hw/pci/msix.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 29187898f2..e6a5559038 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -199,6 +199,9 @@ static const MemoryRegionOps msix_table_mmio_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN,
     .valid = {
         .min_access_size = 4,
+        .max_access_size = 8,
+    },
+    .impl = {
         .max_access_size = 4,
     },
 };
@@ -227,6 +230,9 @@ static const MemoryRegionOps msix_pba_mmio_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN,
     .valid = {
         .min_access_size = 4,
+        .max_access_size = 8,
+    },
+    .impl = {
         .max_access_size = 4,
     },
 };

From acc5c98ddd8f85d41fd22e57cf289aeaa34f160a Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Fri, 5 Jun 2020 18:09:09 -0600
Subject: [PATCH 02/58] diffs-allowed: add the SRAT AML to diffs-allowed

In anticipation of a change to the SRAT generation in qemu, add the AML
file to diffs-allowed.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Message-Id: <20200606000911.9896-2-vishal.l.verma@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/qtest/bios-tables-test-allowed-diff.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..e8f2766a63 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,4 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/pc/SRAT.dimmpxm",
+"tests/data/acpi/q35/SRAT.dimmpxm",
+"tests/data/acpi/virt/SRAT.memhp",

From c3b0cf6e7dbc48ecf84ace4224f220fa9baa85b6 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Fri, 5 Jun 2020 18:09:10 -0600
Subject: [PATCH 03/58] hw/acpi/nvdimm: add a helper to augment SRAT generation

NVDIMMs can belong to their own proximity domains, as described by the
NFIT. In such cases, the SRAT needs to have Memory Affinity structures
in the SRAT for these NVDIMMs, otherwise Linux doesn't populate node
data structures properly during NUMA initialization. See the following
for an example failure case.

https://lore.kernel.org/linux-nvdimm/20200416225438.15208-1-vishal.l.verma@intel.com/

Introduce a new helper, nvdimm_build_srat(), and call it for both the
i386 and arm versions of 'build_srat()' to augment the SRAT with
memory affinity information for NVDIMMs.

The relevant command line options to exercise this are below. Nodes 0-1
contain CPUs and regular memory, and nodes 2-3 are the NVDIMM address
space.

    -object memory-backend-ram,id=mem0,size=2048M
    -numa node,nodeid=0,memdev=mem0,
    -numa cpu,node-id=0,socket-id=0
    -object memory-backend-ram,id=mem1,size=2048M
    -numa node,nodeid=1,memdev=mem1,
    -numa cpu,node-id=1,socket-id=1
    -numa node,nodeid=2,
    -object memory-backend-file,id=nvmem0,share,mem-path=nvdimm-0,size=16384M,align=1G
    -device nvdimm,memdev=nvmem0,id=nv0,label-size=2M,node=2
    -numa node,nodeid=3,
    -object memory-backend-file,id=nvmem1,share,mem-path=nvdimm-1,size=16384M,align=1G
    -device nvdimm,memdev=nvmem1,id=nv1,label-size=2M,node=3

Cc: Jingqi Liu <jingqi.liu@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Message-Id: <20200606000911.9896-3-vishal.l.verma@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/nvdimm.c         | 23 +++++++++++++++++++++++
 hw/arm/virt-acpi-build.c |  4 ++++
 hw/i386/acpi-build.c     |  5 +++++
 include/hw/mem/nvdimm.h  |  1 +
 4 files changed, 33 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9316d12b70..8f7cc16add 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/uuid.h"
+#include "qapi/error.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
 #include "hw/acpi/bios-linker-loader.h"
@@ -1334,6 +1335,28 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
     free_aml_allocator();
 }
 
+void nvdimm_build_srat(GArray *table_data)
+{
+    GSList *device_list = nvdimm_get_device_list();
+
+    for (; device_list; device_list = device_list->next) {
+        AcpiSratMemoryAffinity *numamem = NULL;
+        DeviceState *dev = device_list->data;
+        Object *obj = OBJECT(dev);
+        uint64_t addr, size;
+        int node;
+
+        node = object_property_get_int(obj, PC_DIMM_NODE_PROP, &error_abort);
+        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, &error_abort);
+        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, &error_abort);
+
+        numamem = acpi_data_push(table_data, sizeof *numamem);
+        build_srat_memory(numamem, addr, size, node,
+                          MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE);
+    }
+    g_slist_free(device_list);
+}
+
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        BIOSLinker *linker, NVDIMMState *state,
                        uint32_t ram_slots)
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 1b0a584c7b..2cbccd5fe2 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -539,6 +539,10 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         }
     }
 
+    if (ms->nvdimms_state->is_enabled) {
+        nvdimm_build_srat(table_data);
+    }
+
     if (ms->device_memory) {
         numamem = acpi_data_push(table_data, sizeof *numamem);
         build_srat_memory(numamem, ms->device_memory->base,
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2e15f6848e..d996525e2c 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2428,6 +2428,11 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
                               MEM_AFFINITY_ENABLED);
         }
     }
+
+    if (machine->nvdimms_state->is_enabled) {
+        nvdimm_build_srat(table_data);
+    }
+
     slots = (table_data->len - numa_start) / sizeof *numamem;
     for (; slots < pcms->numa_nodes + 2; slots++) {
         numamem = acpi_data_push(table_data, sizeof *numamem);
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index a3c08955e8..b67a1aedf6 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -155,6 +155,7 @@ typedef struct NVDIMMState NVDIMMState;
 void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io,
                             struct AcpiGenericAddress dsm_io,
                             FWCfgState *fw_cfg, Object *owner);
+void nvdimm_build_srat(GArray *table_data);
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        BIOSLinker *linker, NVDIMMState *state,
                        uint32_t ram_slots);

From 8a49b30034da8c5440f425a94e7fb12425f35283 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Fri, 5 Jun 2020 18:09:11 -0600
Subject: [PATCH 04/58] tests/acpi: update expected SRAT files

Update expected SRAT files for the change to account for NVDIMM NUMA
nodes in the SRAT.

AML diffs:

tests/data/acpi/pc/SRAT.dimmpxm:
Message-Id: <20200606000911.9896-4-vishal.l.verma@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/data/acpi/pc/SRAT.dimmpxm             | Bin 392 -> 392 bytes
 tests/data/acpi/q35/SRAT.dimmpxm            | Bin 392 -> 392 bytes
 tests/data/acpi/virt/SRAT.memhp             | Bin 186 -> 226 bytes
 tests/qtest/bios-tables-test-allowed-diff.h |   3 ---
 4 files changed, 3 deletions(-)

diff --git a/tests/data/acpi/pc/SRAT.dimmpxm b/tests/data/acpi/pc/SRAT.dimmpxm
index f5c0267ea24bb404b6b4e687390140378fbdc3f1..5a13c61b9041c6045c29643bf93a111fb1c0c76a 100644
GIT binary patch
delta 51
scmeBR?qKE$4ss0XU}Rum%-G0fz$nec00kUCF%aN@Pz(&LlS3Je0lmQmhyVZp

delta 51
icmeBR?qKE$4ss0XU}RumY}m+Uz$ndt8%z#mGzI{_tp$hx

diff --git a/tests/data/acpi/q35/SRAT.dimmpxm b/tests/data/acpi/q35/SRAT.dimmpxm
index f5c0267ea24bb404b6b4e687390140378fbdc3f1..5a13c61b9041c6045c29643bf93a111fb1c0c76a 100644
GIT binary patch
delta 51
scmeBR?qKE$4ss0XU}Rum%-G0fz$nec00kUCF%aN@Pz(&LlS3Je0lmQmhyVZp

delta 51
icmeBR?qKE$4ss0XU}RumY}m+Uz$ndt8%z#mGzI{_tp$hx

diff --git a/tests/data/acpi/virt/SRAT.memhp b/tests/data/acpi/virt/SRAT.memhp
index 1b57db2072e7f7e2085c4a427aa31c7383851b71..9a35adb40c6f7cd822e5af37abba8aad033617cb 100644
GIT binary patch
delta 43
rcmdnR_=u4!ILI;N5d#AQbIe4p$wD1K76@=aC<X@BiSc3+=gI;A(y0ha

delta 21
dcmaFFxQmf1ILI+%7Xt$Wv-3o*$rF#t0suzv27~|r

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index e8f2766a63..dfb8523c8b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1,4 +1 @@
 /* List of comma-separated changed AML files to ignore */
-"tests/data/acpi/pc/SRAT.dimmpxm",
-"tests/data/acpi/q35/SRAT.dimmpxm",
-"tests/data/acpi/virt/SRAT.memhp",

From 9b20a3365d73dad4ad144eab9c5827dbbb2e9f21 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:06 +0200
Subject: [PATCH 05/58] qtest: allow DSDT acpi table changes

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-Id: <20200515150421.25479-2-kraxel@redhat.com>
---
 tests/qtest/bios-tables-test-allowed-diff.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..6a052c5044 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,18 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/pc/DSDT",
+"tests/data/acpi/pc/DSDT.acpihmat",
+"tests/data/acpi/pc/DSDT.bridge",
+"tests/data/acpi/pc/DSDT.cphp",
+"tests/data/acpi/pc/DSDT.dimmpxm",
+"tests/data/acpi/pc/DSDT.ipmikcs",
+"tests/data/acpi/pc/DSDT.memhp",
+"tests/data/acpi/pc/DSDT.numamem",
+"tests/data/acpi/q35/DSDT",
+"tests/data/acpi/q35/DSDT.acpihmat",
+"tests/data/acpi/q35/DSDT.bridge",
+"tests/data/acpi/q35/DSDT.cphp",
+"tests/data/acpi/q35/DSDT.dimmpxm",
+"tests/data/acpi/q35/DSDT.ipmibt",
+"tests/data/acpi/q35/DSDT.memhp",
+"tests/data/acpi/q35/DSDT.mmio64",
+"tests/data/acpi/q35/DSDT.numamem",

From df9b9b42cd2375a4eb785702899699a020ca2597 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:07 +0200
Subject: [PATCH 06/58] acpi: move aml builder code for rtc device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200515150421.25479-3-kraxel@redhat.com>
---
 hw/i386/acpi-build.c | 17 -----------------
 hw/rtc/mc146818rtc.c | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index d996525e2c..df5417c75f 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1137,22 +1137,6 @@ static Aml *build_fdc_device_aml(ISADevice *fdc)
     return dev;
 }
 
-static Aml *build_rtc_device_aml(void)
-{
-    Aml *dev;
-    Aml *crs;
-
-    dev = aml_device("RTC");
-    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00")));
-    crs = aml_resource_template();
-    aml_append(crs, aml_io(AML_DECODE16, 0x0070, 0x0070, 0x10, 0x02));
-    aml_append(crs, aml_irq_no_flags(8));
-    aml_append(crs, aml_io(AML_DECODE16, 0x0072, 0x0072, 0x02, 0x06));
-    aml_append(dev, aml_name_decl("_CRS", crs));
-
-    return dev;
-}
-
 static Aml *build_kbd_device_aml(void)
 {
     Aml *dev;
@@ -1278,7 +1262,6 @@ static void build_isa_devices_aml(Aml *table)
     Aml *scope = aml_scope("_SB.PCI0.ISA");
     Object *obj = object_resolve_path_type("", TYPE_ISA_BUS, &ambiguous);
 
-    aml_append(scope, build_rtc_device_aml());
     aml_append(scope, build_kbd_device_aml());
     aml_append(scope, build_mouse_device_aml());
     if (fdc) {
diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c
index 9c30cbdcd7..fe05a4488e 100644
--- a/hw/rtc/mc146818rtc.c
+++ b/hw/rtc/mc146818rtc.c
@@ -27,6 +27,7 @@
 #include "qemu/cutils.h"
 #include "qemu/module.h"
 #include "qemu/bcd.h"
+#include "hw/acpi/aml-build.h"
 #include "hw/irq.h"
 #include "hw/qdev-properties.h"
 #include "qemu/timer.h"
@@ -1007,13 +1008,34 @@ static void rtc_resetdev(DeviceState *d)
     }
 }
 
+static void rtc_build_aml(ISADevice *isadev, Aml *scope)
+{
+    Aml *dev;
+    Aml *crs;
+
+    crs = aml_resource_template();
+    aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE, RTC_ISA_BASE,
+                           0x10, 0x02));
+    aml_append(crs, aml_irq_no_flags(RTC_ISA_IRQ));
+    aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE + 2, RTC_ISA_BASE + 2,
+                           0x02, 0x06));
+
+    dev = aml_device("RTC");
+    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00")));
+    aml_append(dev, aml_name_decl("_CRS", crs));
+
+    aml_append(scope, dev);
+}
+
 static void rtc_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    ISADeviceClass *isa = ISA_DEVICE_CLASS(klass);
 
     dc->realize = rtc_realizefn;
     dc->reset = rtc_resetdev;
     dc->vmsd = &vmstate_rtc;
+    isa->build_aml = rtc_build_aml;
     device_class_set_props(dc, mc146818rtc_properties);
 }
 

From f592b94f3cbd5325bcf5142509b60cb6ab252770 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:08 +0200
Subject: [PATCH 07/58] acpi: rtc: use a single crs range

Use a single io range for _CRS instead of two,
following what real hardware does.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <20200515150421.25479-4-kraxel@redhat.com>
---
 hw/rtc/mc146818rtc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c
index fe05a4488e..1e9fa0f33f 100644
--- a/hw/rtc/mc146818rtc.c
+++ b/hw/rtc/mc146818rtc.c
@@ -1013,12 +1013,14 @@ static void rtc_build_aml(ISADevice *isadev, Aml *scope)
     Aml *dev;
     Aml *crs;
 
+    /*
+     * Reserving 8 io ports here, following what physical hardware
+     * does, even though qemu only responds to the first two ports.
+     */
     crs = aml_resource_template();
     aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE, RTC_ISA_BASE,
-                           0x10, 0x02));
+                           0x01, 0x08));
     aml_append(crs, aml_irq_no_flags(RTC_ISA_IRQ));
-    aml_append(crs, aml_io(AML_DECODE16, RTC_ISA_BASE + 2, RTC_ISA_BASE + 2,
-                           0x02, 0x06));
 
     dev = aml_device("RTC");
     aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0B00")));

From 4b8e369b91deff3ae7d8ae1067d336a66464f472 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:09 +0200
Subject: [PATCH 08/58] acpi: serial: don't use _STA method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The _STA method dates back to the days where we had a static DSDT.  The
device is listed in the DSDT table unconditionally and the _STA method
checks a bit in the isa bridge pci config space to figure whenever a
given is isa device is present or not, then evaluates to 0x0f (present)
or 0x00 (absent).

These days the DSDT is generated by qemu anyway, so if a device is not
present we can simply drop it from the DSDT instead.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200515150421.25479-5-kraxel@redhat.com>
---
 hw/i386/acpi-build.c | 32 ++++++++------------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index df5417c75f..cb22cb0fe6 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1208,50 +1208,34 @@ static Aml *build_lpt_device_aml(void)
     return dev;
 }
 
-static Aml *build_com_device_aml(uint8_t uid)
+static void build_com_device_aml(Aml *scope, uint8_t uid)
 {
     Aml *dev;
     Aml *crs;
-    Aml *method;
-    Aml *if_ctx;
-    Aml *else_ctx;
-    Aml *zero = aml_int(0);
-    Aml *is_present = aml_local(0);
-    const char *enabled_field = "CAEN";
     uint8_t irq = 4;
     uint16_t io_port = 0x03F8;
 
     assert(uid == 1 || uid == 2);
     if (uid == 2) {
-        enabled_field = "CBEN";
         irq = 3;
         io_port = 0x02F8;
     }
+    if (!memory_region_present(get_system_io(), io_port)) {
+        return;
+    }
 
     dev = aml_device("COM%d", uid);
     aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501")));
     aml_append(dev, aml_name_decl("_UID", aml_int(uid)));
 
-    method = aml_method("_STA", 0, AML_NOTSERIALIZED);
-    aml_append(method, aml_store(aml_name("%s", enabled_field), is_present));
-    if_ctx = aml_if(aml_equal(is_present, zero));
-    {
-        aml_append(if_ctx, aml_return(aml_int(0x00)));
-    }
-    aml_append(method, if_ctx);
-    else_ctx = aml_else();
-    {
-        aml_append(else_ctx, aml_return(aml_int(0x0f)));
-    }
-    aml_append(method, else_ctx);
-    aml_append(dev, method);
+    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
 
     crs = aml_resource_template();
     aml_append(crs, aml_io(AML_DECODE16, io_port, io_port, 0x00, 0x08));
     aml_append(crs, aml_irq_no_flags(irq));
     aml_append(dev, aml_name_decl("_CRS", crs));
 
-    return dev;
+    aml_append(scope, dev);
 }
 
 static void build_isa_devices_aml(Aml *table)
@@ -1268,8 +1252,8 @@ static void build_isa_devices_aml(Aml *table)
         aml_append(scope, build_fdc_device_aml(fdc));
     }
     aml_append(scope, build_lpt_device_aml());
-    aml_append(scope, build_com_device_aml(1));
-    aml_append(scope, build_com_device_aml(2));
+    build_com_device_aml(scope, 1);
+    build_com_device_aml(scope, 2);
 
     if (ambiguous) {
         error_report("Multiple ISA busses, unable to define IPMI ACPI data");

From dcdbfaafe90a5e6e172368b2aa5500a9ca192e49 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:10 +0200
Subject: [PATCH 09/58] acpi: move aml builder code for serial device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code uses the isa_serial_io array to figure what the device uid is.
Side effect is that acpi antries are not limited to port 1+2 any more,
we'll also get entries for ports 3+4.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200515150421.25479-6-kraxel@redhat.com>
---
 hw/char/serial-isa.c | 22 ++++++++++++++++++++++
 hw/i386/acpi-build.c | 32 --------------------------------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/hw/char/serial-isa.c b/hw/char/serial-isa.c
index f9b6eed783..165e320e65 100644
--- a/hw/char/serial-isa.c
+++ b/hw/char/serial-isa.c
@@ -27,6 +27,7 @@
 #include "qapi/error.h"
 #include "qemu/module.h"
 #include "sysemu/sysemu.h"
+#include "hw/acpi/aml-build.h"
 #include "hw/char/serial.h"
 #include "hw/isa/isa.h"
 #include "hw/qdev-properties.h"
@@ -81,6 +82,25 @@ static void serial_isa_realizefn(DeviceState *dev, Error **errp)
     isa_register_ioport(isadev, &s->io, isa->iobase);
 }
 
+static void serial_isa_build_aml(ISADevice *isadev, Aml *scope)
+{
+    ISASerialState *isa = ISA_SERIAL(isadev);
+    Aml *dev;
+    Aml *crs;
+
+    crs = aml_resource_template();
+    aml_append(crs, aml_io(AML_DECODE16, isa->iobase, isa->iobase, 0x00, 0x08));
+    aml_append(crs, aml_irq_no_flags(isa->isairq));
+
+    dev = aml_device("COM%d", isa->index + 1);
+    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501")));
+    aml_append(dev, aml_name_decl("_UID", aml_int(isa->index + 1)));
+    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
+    aml_append(dev, aml_name_decl("_CRS", crs));
+
+    aml_append(scope, dev);
+}
+
 static const VMStateDescription vmstate_isa_serial = {
     .name = "serial",
     .version_id = 3,
@@ -103,9 +123,11 @@ static Property serial_isa_properties[] = {
 static void serial_isa_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    ISADeviceClass *isa = ISA_DEVICE_CLASS(klass);
 
     dc->realize = serial_isa_realizefn;
     dc->vmsd = &vmstate_isa_serial;
+    isa->build_aml = serial_isa_build_aml;
     device_class_set_props(dc, serial_isa_properties);
     set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
 }
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index cb22cb0fe6..6de25f6484 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1208,36 +1208,6 @@ static Aml *build_lpt_device_aml(void)
     return dev;
 }
 
-static void build_com_device_aml(Aml *scope, uint8_t uid)
-{
-    Aml *dev;
-    Aml *crs;
-    uint8_t irq = 4;
-    uint16_t io_port = 0x03F8;
-
-    assert(uid == 1 || uid == 2);
-    if (uid == 2) {
-        irq = 3;
-        io_port = 0x02F8;
-    }
-    if (!memory_region_present(get_system_io(), io_port)) {
-        return;
-    }
-
-    dev = aml_device("COM%d", uid);
-    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0501")));
-    aml_append(dev, aml_name_decl("_UID", aml_int(uid)));
-
-    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
-
-    crs = aml_resource_template();
-    aml_append(crs, aml_io(AML_DECODE16, io_port, io_port, 0x00, 0x08));
-    aml_append(crs, aml_irq_no_flags(irq));
-    aml_append(dev, aml_name_decl("_CRS", crs));
-
-    aml_append(scope, dev);
-}
-
 static void build_isa_devices_aml(Aml *table)
 {
     ISADevice *fdc = pc_find_fdc0();
@@ -1252,8 +1222,6 @@ static void build_isa_devices_aml(Aml *table)
         aml_append(scope, build_fdc_device_aml(fdc));
     }
     aml_append(scope, build_lpt_device_aml());
-    build_com_device_aml(scope, 1);
-    build_com_device_aml(scope, 2);
 
     if (ambiguous) {
         error_report("Multiple ISA busses, unable to define IPMI ACPI data");

From 3e824d38256364f983be8717214fef709bc90e31 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:11 +0200
Subject: [PATCH 10/58] acpi: parallel: don't use _STA method

The _STA method dates back to the days where we had a static DSDT.  The
device is listed in the DSDT table unconditionally and the _STA method
checks a bit in the isa bridge pci config space to figure whenever a
given is isa device is present or not, then evaluates to 0x0f (present)
or 0x00 (absent).

These days the DSDT is generated by qemu anyway, so if a device is not
present we can simply drop it from the DSDT instead.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <20200515150421.25479-7-kraxel@redhat.com>
---
 hw/i386/acpi-build.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 6de25f6484..2cafad03e2 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1173,39 +1173,26 @@ static Aml *build_mouse_device_aml(void)
     return dev;
 }
 
-static Aml *build_lpt_device_aml(void)
+static void build_lpt_device_aml(Aml *scope)
 {
     Aml *dev;
     Aml *crs;
-    Aml *method;
-    Aml *if_ctx;
-    Aml *else_ctx;
-    Aml *zero = aml_int(0);
-    Aml *is_present = aml_local(0);
+
+    if (!memory_region_present(get_system_io(), 0x0378)) {
+        return;
+    }
 
     dev = aml_device("LPT");
     aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400")));
 
-    method = aml_method("_STA", 0, AML_NOTSERIALIZED);
-    aml_append(method, aml_store(aml_name("LPEN"), is_present));
-    if_ctx = aml_if(aml_equal(is_present, zero));
-    {
-        aml_append(if_ctx, aml_return(aml_int(0x00)));
-    }
-    aml_append(method, if_ctx);
-    else_ctx = aml_else();
-    {
-        aml_append(else_ctx, aml_return(aml_int(0x0f)));
-    }
-    aml_append(method, else_ctx);
-    aml_append(dev, method);
+    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
 
     crs = aml_resource_template();
     aml_append(crs, aml_io(AML_DECODE16, 0x0378, 0x0378, 0x08, 0x08));
     aml_append(crs, aml_irq_no_flags(7));
     aml_append(dev, aml_name_decl("_CRS", crs));
 
-    return dev;
+    aml_append(scope, dev);
 }
 
 static void build_isa_devices_aml(Aml *table)
@@ -1221,7 +1208,7 @@ static void build_isa_devices_aml(Aml *table)
     if (fdc) {
         aml_append(scope, build_fdc_device_aml(fdc));
     }
-    aml_append(scope, build_lpt_device_aml());
+    build_lpt_device_aml(scope);
 
     if (ambiguous) {
         error_report("Multiple ISA busses, unable to define IPMI ACPI data");

From ed003c8c7743b846b1bf3f87cb88baf090da44e6 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 15 May 2020 17:04:12 +0200
Subject: [PATCH 11/58] acpi: move aml builder code for parallel device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also adds support for multiple LPT devices.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200515150421.25479-8-kraxel@redhat.com>
---
 hw/char/parallel.c   | 22 ++++++++++++++++++++++
 hw/i386/acpi-build.c | 23 -----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/hw/char/parallel.c b/hw/char/parallel.c
index 8dd67d1375..c0f34bf924 100644
--- a/hw/char/parallel.c
+++ b/hw/char/parallel.c
@@ -28,6 +28,7 @@
 #include "qemu/module.h"
 #include "chardev/char-parallel.h"
 #include "chardev/char-fe.h"
+#include "hw/acpi/aml-build.h"
 #include "hw/irq.h"
 #include "hw/isa/isa.h"
 #include "hw/qdev-properties.h"
@@ -568,6 +569,25 @@ static void parallel_isa_realizefn(DeviceState *dev, Error **errp)
                              s, "parallel");
 }
 
+static void parallel_isa_build_aml(ISADevice *isadev, Aml *scope)
+{
+    ISAParallelState *isa = ISA_PARALLEL(isadev);
+    Aml *dev;
+    Aml *crs;
+
+    crs = aml_resource_template();
+    aml_append(crs, aml_io(AML_DECODE16, isa->iobase, isa->iobase, 0x08, 0x08));
+    aml_append(crs, aml_irq_no_flags(isa->isairq));
+
+    dev = aml_device("LPT%d", isa->index + 1);
+    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400")));
+    aml_append(dev, aml_name_decl("_UID", aml_int(isa->index + 1)));
+    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
+    aml_append(dev, aml_name_decl("_CRS", crs));
+
+    aml_append(scope, dev);
+}
+
 /* Memory mapped interface */
 static uint64_t parallel_mm_readfn(void *opaque, hwaddr addr, unsigned size)
 {
@@ -624,9 +644,11 @@ static Property parallel_isa_properties[] = {
 static void parallel_isa_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    ISADeviceClass *isa = ISA_DEVICE_CLASS(klass);
 
     dc->realize = parallel_isa_realizefn;
     dc->vmsd = &vmstate_parallel_isa;
+    isa->build_aml = parallel_isa_build_aml;
     device_class_set_props(dc, parallel_isa_properties);
     set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
 }
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2cafad03e2..58fe505fb6 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1173,28 +1173,6 @@ static Aml *build_mouse_device_aml(void)
     return dev;
 }
 
-static void build_lpt_device_aml(Aml *scope)
-{
-    Aml *dev;
-    Aml *crs;
-
-    if (!memory_region_present(get_system_io(), 0x0378)) {
-        return;
-    }
-
-    dev = aml_device("LPT");
-    aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0400")));
-
-    aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
-
-    crs = aml_resource_template();
-    aml_append(crs, aml_io(AML_DECODE16, 0x0378, 0x0378, 0x08, 0x08));
-    aml_append(crs, aml_irq_no_flags(7));
-    aml_append(dev, aml_name_decl("_CRS", crs));
-
-    aml_append(scope, dev);
-}
-
 static void build_isa_devices_aml(Aml *table)
 {
     ISADevice *fdc = pc_find_fdc0();
@@ -1208,7 +1186,6 @@ static void build_isa_devices_aml(Aml *table)
     if (fdc) {
         aml_append(scope, build_fdc_device_aml(fdc));
     }
-    build_lpt_device_aml(scope);
 
     if (ambiguous) {
         error_report("Multiple ISA busses, unable to define IPMI ACPI data");

From bab16ab33012cdae36246d2994b4a5ed9ac441f0 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 9 Jun 2020 11:16:14 -0400
Subject: [PATCH 12/58] tests/acpi: update DSDT expected files

Update DSDT after CRS changes and _STA methods dropped.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/data/acpi/pc/DSDT                     | Bin 5125 -> 5014 bytes
 tests/data/acpi/pc/DSDT.acpihmat            | Bin 6449 -> 6338 bytes
 tests/data/acpi/pc/DSDT.bridge              | Bin 6984 -> 6873 bytes
 tests/data/acpi/pc/DSDT.cphp                | Bin 5588 -> 5477 bytes
 tests/data/acpi/pc/DSDT.dimmpxm             | Bin 6778 -> 6667 bytes
 tests/data/acpi/pc/DSDT.ipmikcs             | Bin 5197 -> 5086 bytes
 tests/data/acpi/pc/DSDT.memhp               | Bin 6484 -> 6373 bytes
 tests/data/acpi/pc/DSDT.numamem             | Bin 5131 -> 5020 bytes
 tests/data/acpi/q35/DSDT                    | Bin 7863 -> 7752 bytes
 tests/data/acpi/q35/DSDT.acpihmat           | Bin 9187 -> 9076 bytes
 tests/data/acpi/q35/DSDT.bridge             | Bin 7880 -> 7769 bytes
 tests/data/acpi/q35/DSDT.cphp               | Bin 8326 -> 8215 bytes
 tests/data/acpi/q35/DSDT.dimmpxm            | Bin 9516 -> 9405 bytes
 tests/data/acpi/q35/DSDT.ipmibt             | Bin 7938 -> 7827 bytes
 tests/data/acpi/q35/DSDT.memhp              | Bin 9222 -> 9111 bytes
 tests/data/acpi/q35/DSDT.mmio64             | Bin 8993 -> 8882 bytes
 tests/data/acpi/q35/DSDT.numamem            | Bin 7869 -> 7758 bytes
 tests/qtest/bios-tables-test-allowed-diff.h |  17 -----------------
 18 files changed, 17 deletions(-)

diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT
index ad4b2d46cc7865e8bafcca2e4e888a03cc5483b5..384a82dbb3cb0e9f47db6f4d08945631c2b72b56 100644
GIT binary patch
delta 164
zcmZqGn5NF<66_K(O_+g!v27z)FQb5;xIS}yuv2`1v!{V)uw(q>Ym5)2jC=w@3_0RG
zJY9GkFR(Ch#D{viFml8Phd55Y%On+U<m~SYmu3Y@z@@qPIpUpzf&~S*c-<L)F#lj?
n;7}4^sAPz4QVj}mj)xh^4Kxz2tbn0_fssRrfpK#abF%;dyx=Gg

delta 282
zcmbQH-m1an66_MfD#F0PsJ4--mr>YPPM<kG*eO21+0(!?*fBo3Nq6!`CJ_k<E>U;J
z0)_$x0VX8|Mt8;{h9U+gwn~P{%NZZm+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX>?2004S}
BL(TvI

diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat
index eff7aadfabe431c3ac2d28e0c6721eb6e322af66..47ddfdb027b06dc2daa46be711c3f4640ce68320 100644
GIT binary patch
delta 164
zcmdmJbjXm)CD<k8kOTt*W5`CXUPb{waee0aV5j&1XHNsqV8{5$*BBp48TkZ+7;?mW
zc)IX7USMJ1h!6F2VdRJp4so1(mq{wz$l2c)F3k#*fJ<}nbHqCb1q%vr@wzkqVE)0(
nz@a3-P{|P8q#6|B91k;+8)zh4Sph=<10#nL1LNi<=Jlcg0#+!i

delta 282
zcmX?PxY3BqCD<jzP?CXx@ykZ8UPfVGIeq5%V5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&FK2vMZ|4&b63-Fu;pxKTc!7mML?S*o#F3%EC&1M&
zVFBCZ1coIH3t5(M@pHsG2L%fXaPhh`RxnpEb8sj%FjO)`H@UJn`}-Qg3}9vCh!6F2
zVT2ju?1<Z#AIv|P8GJaD1i<FFfy^;NGlvW4dWd_Ro$wgM^n;0kLy3c7voP~|Q2_l*
BMQQ*5

diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge
index 92ae808e2e071e367f0b68b3553239bd56b4b766..d1e2fa9fb8c75160fc1fa46deed6a6a9cb515559 100644
GIT binary patch
delta 164
zcmX?McGHy0CD<k8rW6AM<N1wTy^I2W;`+?-!A|i3&YlLI!H)5huQ5K9GV%!sG31E%
z@O0sEyuiZ15g+R5!pIRH9O5|nE|XNak+Z)qT$&Xq0hi|D=ZJR>3KkUL;&o^I!Tf`n
nfkR1vp^_oGNi`_MIUZ&tH_%A9vI2$z21X7g2FA@z%*Vt5d=n{-

delta 282
zcmca<dcus$CD<jzLz;nsF@GahFQc%poIZ1Wuv2`1v!{V)uw#65lkVh?Od=8zT%zub
z1q=lY0!&H_jP8s@3`Gn~Y?Ta?moq-BxAO@IiRXy-@O0sEyuiXBA`u@P;>b|o6X5EX
zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi
zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(y2lH~`2b
BMNI$z

diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp
index f3572358510f3fbb3d966047274f7aa7835e7bef..54f481faf1e336c0bbf5e774cd67220fe06e951b 100644
GIT binary patch
delta 164
zcmcbj{Zxy~CD<h-Rg{5&@&87yUPb{waee0aV5j&1XHNsqV8{5$*BBp48TkZ+7;?mW
zc)IX7USMJ1h!6F2VdRJp4so1(mq{wz$l2c)F3k#*fJ<}nbHqCb1q%vr@wzkqVE)0(
nz@a3-P{|P8q#6|B91k;+8)zh4Sph=<10#nL1LNi<<{%*ePh=?0

delta 282
zcmaE=bw!)YCD<k8iYNmE<ARM`y^O-Xa{A2i!A|i3&YlLI!H)6KO}djmGKok?aEZDz
z7BCbr2rwxzFuF4qF%&T{u~jlmUe5Ti-p(f=B%UMQ!_$Sw@d68jh(vsFh$BORPk^gm
z!UDF*2@FdZ7P2hi;^&BW4hj|&;No>>tYEHS=HO6jV5nq>ZgORD_V+b}8NkZO5g+R5
z!U!|Q*%7xfKbU_oGx%^Q34qOU1DRulW)2t7^$_<sJK-^i=?4=7hY|<FW?|+aApncU
BMU4Oe

diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm
index 7fa09463c16c2f6b60114591785918b9d96b4f17..5d98016ae571cde04ff96d58212e0faf9aaf50e6 100644
GIT binary patch
delta 164
zcmexm(rv=!66_MfEycjV_-rFrFQb5;xIS}yuv2`1v!{V)uw(q>Ym5)2jC=w@3_0RG
zJY9GkFR(Ch#D{viFml8Phd55Y%On+U<m~SYmu3Y@z@@qPIpUpzf&~S*c-<L)F#lj?
n;7}4^sAPz4QVj}mj)xh^4Kxz2tbn0_fssRrfpK#a^A9lq`Rpj^

delta 282
zcmeA+`DMc866_LECB?wN*uIggmr>YPPM<kG*eO21+0(!?*fBo3Nq6!`CJ_k<E>U;J
z0)_$x0VX8|Mt8;{h9U+gwn~P{%NZZm+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX@2v3;^?H
BMW_G(

diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs
index 469d13e1f6b873bb9cfa0b3af32d1a3bc58e8f77..57b78358744a5bb13639ccddb887be2721240807 100644
GIT binary patch
delta 172
zcmX@BaZjDgCD<k8o-hLgqtr&OUPb{o8GYvXV5j&1XHNsqV8{5$*BBp8Zew<niEc9T
z2?#Odi1+Yx;c>jc!oU$9>gmGB5g#1lIQcG<RHTu!zb{;x6(|9h=HlmwcMb{`6yV}@
vXZ*qZgPDOtNr0h}A-YL5D8xA)W+XSzNVu{Bh5`mg4kZT0N`}p6m>UHE!Nn_l

delta 259
zcmcboepZ9aCD<jzSA>Cqao0w!UPgOYWqs!OV5j&1XHNsqV8{6ACf%SA=Xj2I4^I~!
z#|zvH9P!RU!GaQ8qV9|Z3<V4VOiB!l?u<nYMGQ=Al?;=YGd@(a^9cxn>tJCJk%$it
zabzg)32^mGSim+pfnf>5LY5_yUolDPy0SR?`x?SbXJzDw5A}3mgz0d0MAae3EamD3
t(qV+AgNq5S#R)|V7eB<k0$jZAj6axuFfni_aWGU)Ze{Y^9LwA&008daLcst4

diff --git a/tests/data/acpi/pc/DSDT.memhp b/tests/data/acpi/pc/DSDT.memhp
index aee75bea272cc9983202cceb747aecc297b5104e..8cb90ef14e13be85995c6fe3d3f6d12f4d939504 100644
GIT binary patch
delta 164
zcmca&^wf~cCD<k8sRRQ9W5q_UUPb{waee0aV5j&1XHNsqV8{5$*BBp48TkZ+7;?mW
zc)IX7USMJ1h!6F2VdRJp4so1(mq{wz$l2c)F3k#*fJ<}nbHqCb1q%vr@wzkqVE)0(
nz@a3-P{|P8q#6|B91k;+8)zh4Sph=<10#nL1LNi<=1ZagS$8R0

delta 282
zcmaEAc*TgzCD<h-M3RAlQDP%kFQc%poIZ1Wuv2`1v!{V)uw#65lkVh?Od=8zT%zub
z1q=lY0!&H_jP8s@3`Gn~Y?Ta?moq-BxAO@IiRXy-@O0sEyuiXBA`u@P;>b|o6X5EX
zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi
zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(y2fC;*9S
BMH&DA

diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem
index 9a747f6f08f61c73b891d8f91db01521e635f811..f194bc639482eb839a875d493857526f85f1a9e0 100644
GIT binary patch
delta 164
zcmeCyn4`|+66_K(N0@<u(O@H2FQb5;xIS}yuv2`1v!{V)uw(q>Ym5)2jC=w@3_0RG
zJY9GkFR(Ch#D{viFml8Phd55Y%On+U<m~SYmu3Y@z@@qPIpUpzf&~S*c-<L)F#lj?
n;7}4^sAPz4QVj}mj)xh^4Kxz2tbn0_fssRrfpK#abEg0RmCh&8

delta 282
zcmbQE-mSsq66_MfEyBRScwr+~FQc%poIZ1Wuv2`1v!{V)uw#65lkVh?Od=8zT%zub
z1q=lY0!&H_jP8s@3`Gn~Y?Ta?moq-BxAO@IiRXy-@O0sEyuiXBA`u@P;>b|o6X5EX
zuz+oH0>cu9g)B?B_&MU8gMtMGxOm+eE0`;oIXILW7%CZ}n_O9({e2B#2Cy=6#D{vi
zFv5&+cEoMW59S}t3_cu60$_98K;{^snZpHiJ;XiEPIwGr`oYA&p~S(kS(v#~002NJ
BM34Xg

diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT
index 9fa4d5a405c2bcd9313b13894917622bf156013e..6a5e4dd85a7d9a95f7ad0fb95e6a4fa7a8d91adb 100644
GIT binary patch
delta 164
zcmdmPd%}jxCD<jzLym!g@ybRn0VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8
zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c
na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`I*;E<;P|hab

delta 282
zcmX?Mv)z`<CD<ioyBq@pW9ddN0VZKDF@5IvV5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX-^w4FJ*)
BL!JNt

diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat
index 2d834a854ccddc17afd0bc4b4a9e0886feff8e65..c1dd7773f3386a946fcb4a9a3bf9ad3a33ddbbe9 100644
GIT binary patch
delta 164
zcmaFt{>6>UCD<jTM45qsanVLD0VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8
zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c
na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`IIZ*)smNh4P

delta 282
zcmez3_Sl`vCD<k8u`&Y#qw_{C0VZKDF@5IvV5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX<s!002nW
BL^=Qf

diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge
index b75122b24a41b8fd58b9e7071032298e0fc2284b..2ef1e894a35b9e85fe07e2678bd2456f5ec40dc6 100644
GIT binary patch
delta 164
zcmX?Md((!?CD<h-QjURvv1B8c0F!_>zdmz(uv2`1v!{V)uw(q>NTxU`BcFf}LymY4
zPZu7?3oHyA@u8kBj2!X7A&!&pGD(FSIs5y<rCEUzaA_`nj(F#wU_k*cUU$YH%s-eJ
nIFtk!DjA}iRD(jC<6%Z}1C4|$D_|&KVB}C@VBFlq>?REWIH4xP

delta 282
zcmca<bHbL(CD<k8gd76{qv%F10VZKDF@5IvV5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX?dC4FIJi
BLxKPR

diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp
index c59c19ff46b9bb4fa3e06e9ffbcbeba308a80cd0..74e86176e5fec46e660c567acf8fbcf08a14bdfb 100644
GIT binary patch
delta 164
zcmZp3obJHo66_KpuE4;+=(~|ifJwlcU!OTX*eO21+0(!?*fD-`BvYJ}kxxK~AxFH2
zrwfnc1r`R5_)t$5MvnO45XZ@PnWVyvoc(>_(yTxUxHK0(N4#@Tu%G}JuRG%p<{!)q
n97+NVl?>5MszD*n@h~H~fkwiW6)+SqFmfm{Fm7&QW|0K|t9T`#

delta 282
zcmbR4(B{bH66_Mvroh0!_--SY0F$tnm_BoSuv2`1v!{V)uw#65lkVh?Od=8zT%zub
z1q=lY0!&H_jP8s@3`Gn~Y?Ta?wV2}S?R)}4;yL0yJY9GkFR(C(NW=$+I5HIY1i1Po
zEMS|Qz_5g2A<GgjevWwOpkP4(E?#%W3g!xC4i2RThDwI$CRY|`e_unG0j!K1@u8kB
zj4)%I9dR4;gZT$DgAa$20N5NikU2(Z=5PUB4{?vP6CQ(@elRg`C~+`s7G`FV1pvEf
BLtg*@

diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm
index 9edc104ee6b06dd9d909b65b83c99494d03a137d..4bf8e9d64b04f9f805047d6850c2dd0086970445 100644
GIT binary patch
delta 164
zcmZ4Ewbzr&CD<iouL=VLqxeQH0VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8
zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c
na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`Id6gmnFXbm3

delta 282
zcmdn%xyFmjCD<iIN0otrar;It0VZKDF@5IvV5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX=%05de>t
BL)!oV

diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt
index 3910e9b767808962b46501da51945229359e3d1d..38723daef80421ea528b2ad2d411e7357df43956 100644
GIT binary patch
delta 172
zcmZp&n{3PF66_K(S&o5$kz*s50F!{5h(2?Cuv2`1v!{V)uw(q>NT#^SZOooB(M?7^
z0U?GQ@gANoJdPJw7&ziXJzW?%;)6pRC*Ng~iZpWe_k~Nd0wv(mT>KpI&OyO~0$jZA
uj6axvFf(u{2{2SLL^r7hg*eB<jN}Fy30GFYPylq25(8r;!{+nM#?k<Ck|_WH

delta 258
zcmbPi+hoV(66_MfB+tOWxMCxh0F%9|v_5luuv2`1v!{V)uw#65lWtImb38}9ho=jV
z;{|R8j(F#wU_l8kQFq1yh5`lwCM5<&cg7-yA_gY5N`}c=OmS*<J^>+c9V`qY67j(y
zjtm7p0j_=t3)m(nFf3tM$g*VeTP7)8R~Bb~UqhJbtc)D-p`I>`Fdfd0s5%sxrCi)V
rI*ibCa52HPIH73a;)l3bfQ#3i@dwinCI${A4u;7sOum~Fn2n_Y$^<{r

diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp
index 8461e984c965da916d828884f6629422e83e429c..98328d1c4197ab19a71de7f3f18e2985f4910f45 100644
GIT binary patch
delta 164
zcmZqknC{Ny66_K(U73M_@z_Q#0VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8
zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c
na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`IxkUj0OPVJw

delta 282
zcmbR4-sZvO66_MfrozC$n6{BifJxX(OrJSE*eO21+0(!?*fBo3Nq6!`CJ_k<E>U;J
z0)_$x0VX8|Mt8;{h9U+gwn~P{T1;{Ec0K_i@f`6Uo-RC&7g!iXB;tcZ92p9H0$lwP
z7O+iDU|7PikYx!MKS#WCP_Uo?7q2^G1#<;62ZvGvLnT9WlPim}zpo+809Hnh_)t$5
zMwl_qj<}8a!Tf`n!G}Xh0BnvM$Q&aybGU%6hq%Yt36DWcKbROelsFhR3p2MU006lR
BLhJwl

diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64
index fc0cc096baf8aedc0a526978ff796025d7380453..5916c0e9ce0a9607c6230f9dfebe2c1be70b2495 100644
GIT binary patch
delta 164
zcmZ4Jw#k*tCD<iolM(|1W79@10VV-&etqWnV5j&1XHNsqV8{5$kxX$?Mm_-{h8*!8
zo-RC&7g!iL;zK=M7&+pDLmVgHWs(Xva`yLyOS1wc;L=?D9P!RU!GZ!@yzY!Yn13)c
na3~2dR5C<2sRo5O$HR=|1{w)hR=`lez{sJ*z__`IdA>XVQ;;W5

delta 282
zcmdnwy3mcwCD<iIQJH~(QE4NW0F$tnm_BoSuv2`1v!{V)uw#65lkVh?Od=8zT%zub
z1q=lY0!&H_jP8s@3`Gn~Y?Ta?wV2}S?R)}4;yL0yJY9GkFR(C(NW=$+I5HIY1i1Po
zEMS|Qz_5g2A<GgjevWwOpkP4(E?#%W3g!xC4i2RThDwI$CRY|`e_unG0j!K1@u8kB
zj4)%I9dR4;gZT$DgAa$20N5NikU2(Z=5PUB4{?vP6CQ(@elRg`C~+`s7G|C=4*<ln
BLk<7{

diff --git a/tests/data/acpi/q35/DSDT.numamem b/tests/data/acpi/q35/DSDT.numamem
index 498c843be1695a0fe1d31d88873b57206d17d758..cf3fde3449bc8e9bbe683b936cf9866590b0ef82 100644
GIT binary patch
delta 164
zcmdmMd(MW-CD<jzPmY0sv2G)m0F!_>zdmz(uv2`1v!{V)uw(q>NTxU`BcFf}LymY4
zPZu7?3oHyA@u8kBj2!X7A&!&pGD(FSIs5y<rCEUzaA_`nj(F#wU_k*cUU$YH%s-eJ
nIFtk!DjA}iRD(jC<6%Z}1C4|$D_|&KVB}C@VBFlqY%L7{DZD13

delta 282
zcmX?Sv)7i(CD<iouN(sdqx?oL0VZKDF@5IvV5j&1XHNsqV8{6ACf&&&nM5QcxJ2C<
z3m6I*1elZ<7~L6*7>XE}*eV$&Yca*u+xY~9#B;=Zc)IX7USMGmk%$itabzg)32^mG
zSim+pfnf>5LY5_5{2cMlLBWCoT)gg#70eaP92`mw43!MgO|C4?{=SAV16Ub3;zK=M
z7-7aZJK{Fx2lEeR1|JS30kAo4Aajh+%;5sM9^xKnCp-o*{a|9?P~u?NEX-^z4FHR!
BLtp>^

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index 6a052c5044..dfb8523c8b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1,18 +1 @@
 /* List of comma-separated changed AML files to ignore */
-"tests/data/acpi/pc/DSDT",
-"tests/data/acpi/pc/DSDT.acpihmat",
-"tests/data/acpi/pc/DSDT.bridge",
-"tests/data/acpi/pc/DSDT.cphp",
-"tests/data/acpi/pc/DSDT.dimmpxm",
-"tests/data/acpi/pc/DSDT.ipmikcs",
-"tests/data/acpi/pc/DSDT.memhp",
-"tests/data/acpi/pc/DSDT.numamem",
-"tests/data/acpi/q35/DSDT",
-"tests/data/acpi/q35/DSDT.acpihmat",
-"tests/data/acpi/q35/DSDT.bridge",
-"tests/data/acpi/q35/DSDT.cphp",
-"tests/data/acpi/q35/DSDT.dimmpxm",
-"tests/data/acpi/q35/DSDT.ipmibt",
-"tests/data/acpi/q35/DSDT.memhp",
-"tests/data/acpi/q35/DSDT.mmio64",
-"tests/data/acpi/q35/DSDT.numamem",

From 7e7c1b84ca11c36a4269baa6538a47dbdf2ee758 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 29 May 2020 15:28:40 -0400
Subject: [PATCH 13/58] acpi: tpm: Do not build TCPA table for TPM 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not build a TCPA table for TPM 2 anymore but create the log area when
building the TPM2 table. The TCPA table is only needed for TPM 1.2.

Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 58fe505fb6..d05d010f77 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2216,6 +2216,10 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
     tpm2_ptr->log_area_minimum_length =
         cpu_to_le32(TPM_LOG_AREA_MINIMUM_SIZE);
 
+    acpi_data_push(tcpalog, le32_to_cpu(tpm2_ptr->log_area_minimum_length));
+    bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1,
+                             false);
+
     /* log area start address to be filled by Guest linker */
     bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                    log_addr_offset, log_addr_size,
@@ -2752,10 +2756,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
         build_hpet(tables_blob, tables->linker);
     }
     if (misc.tpm_version != TPM_VERSION_UNSPEC) {
-        acpi_add_table(table_offsets, tables_blob);
-        build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog);
-
-        if (misc.tpm_version == TPM_VERSION_2_0) {
+        if (misc.tpm_version == TPM_VERSION_1_2) {
+            acpi_add_table(table_offsets, tables_blob);
+            build_tpm_tcpa(tables_blob, tables->linker, tables->tcpalog);
+        } else { /* TPM_VERSION_2_0 */
             acpi_add_table(table_offsets, tables_blob);
             build_tpm2(tables_blob, tables->linker, tables->tcpalog);
         }

From 04b778610af51987309055440b1ff16b50db5646 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 1 Jun 2020 11:57:34 +0200
Subject: [PATCH 14/58] acpi: Convert build_tpm2() to build_append* API

In preparation of its move to the generic acpi code,
let's convert build_tpm2() to use build_append API. This
latter now is prefered in place of direct ACPI struct field
settings with manual endianness conversion.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Message-Id: <20200601095737.32671-2-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index d05d010f77..8d93a2d339 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2197,30 +2197,40 @@ build_tpm_tcpa(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
 static void
 build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
 {
-    Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof *tpm2_ptr);
+    Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader));
     unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address);
     unsigned log_addr_offset =
         (char *)&tpm2_ptr->log_area_start_address - table_data->data;
+    uint8_t start_method_params[12] = {};
 
-    tpm2_ptr->platform_class = cpu_to_le16(TPM2_ACPI_CLASS_CLIENT);
+    /* platform class */
+    build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2);
+    /* reserved */
+    build_append_int_noprefix(table_data, 0, 2);
     if (TPM_IS_TIS_ISA(tpm_find())) {
-        tpm2_ptr->control_area_address = cpu_to_le64(0);
-        tpm2_ptr->start_method = cpu_to_le32(TPM2_START_METHOD_MMIO);
+        /* address of control area */
+        build_append_int_noprefix(table_data, 0, 8);
+        /* start method */
+        build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4);
     } else if (TPM_IS_CRB(tpm_find())) {
-        tpm2_ptr->control_area_address = cpu_to_le64(TPM_CRB_ADDR_CTRL);
-        tpm2_ptr->start_method = cpu_to_le32(TPM2_START_METHOD_CRB);
+        build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8);
+        build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4);
     } else {
         g_warn_if_reached();
     }
 
-    tpm2_ptr->log_area_minimum_length =
-        cpu_to_le32(TPM_LOG_AREA_MINIMUM_SIZE);
+    /* platform specific parameters */
+    g_array_append_vals(table_data, &start_method_params, 12);
 
-    acpi_data_push(tcpalog, le32_to_cpu(tpm2_ptr->log_area_minimum_length));
+    /* log area minimum length */
+    build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4);
+
+    acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE);
     bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1,
                              false);
 
     /* log area start address to be filled by Guest linker */
+    build_append_int_noprefix(table_data, 0, 8);
     bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                    log_addr_offset, log_addr_size,
                                    ACPI_BUILD_TPMLOG_FILE, 0);

From 4338416064303aad7929a247770f2ed4d9652966 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 1 Jun 2020 11:57:35 +0200
Subject: [PATCH 15/58] acpi: Move build_tpm2() in the generic part

We plan to build the TPM2 table on ARM too. In order to reuse the
generation code, let's move build_tpm2() to aml-build.c.

No change in the implementation.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Message-Id: <20200601095737.32671-3-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/aml-build.c         | 44 +++++++++++++++++++++++++++++++++++++
 hw/i386/acpi-build.c        | 44 -------------------------------------
 include/hw/acpi/aml-build.h |  2 ++
 3 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 3681ec6e3d..b37052c1b4 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -26,6 +26,7 @@
 #include "qemu/bitops.h"
 #include "sysemu/numa.h"
 #include "hw/boards.h"
+#include "hw/acpi/tpm.h"
 
 static GArray *build_alloc_array(void)
 {
@@ -1877,6 +1878,49 @@ build_hdr:
                  "FACP", tbl->len - fadt_start, f->rev, oem_id, oem_table_id);
 }
 
+void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
+{
+    Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader));
+    unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address);
+    unsigned log_addr_offset =
+        (char *)&tpm2_ptr->log_area_start_address - table_data->data;
+    uint8_t start_method_params[12] = {};
+
+    /* platform class */
+    build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2);
+    /* reserved */
+    build_append_int_noprefix(table_data, 0, 2);
+    if (TPM_IS_TIS_ISA(tpm_find())) {
+        /* address of control area */
+        build_append_int_noprefix(table_data, 0, 8);
+        /* start method */
+        build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4);
+    } else if (TPM_IS_CRB(tpm_find())) {
+        build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8);
+        build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4);
+    } else {
+        g_warn_if_reached();
+    }
+
+    /* platform specific parameters */
+    g_array_append_vals(table_data, &start_method_params, 12);
+
+    /* log area minimum length */
+    build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4);
+
+    acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE);
+    bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1,
+                             false);
+
+    /* log area start address to be filled by Guest linker */
+    build_append_int_noprefix(table_data, 0, 8);
+    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
+                                   log_addr_offset, log_addr_size,
+                                   ACPI_BUILD_TPMLOG_FILE, 0);
+    build_header(linker, table_data,
+                 (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL);
+}
+
 /* ACPI 5.0: 6.4.3.8.2 Serial Bus Connection Descriptors */
 static Aml *aml_serial_bus_device(uint8_t serial_bus_type, uint8_t flags,
                                   uint16_t type_flags,
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 8d93a2d339..1ecb68f45f 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2194,50 +2194,6 @@ build_tpm_tcpa(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
                  (void *)tcpa, "TCPA", sizeof(*tcpa), 2, NULL, NULL);
 }
 
-static void
-build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
-{
-    Acpi20TPM2 *tpm2_ptr = acpi_data_push(table_data, sizeof(AcpiTableHeader));
-    unsigned log_addr_size = sizeof(tpm2_ptr->log_area_start_address);
-    unsigned log_addr_offset =
-        (char *)&tpm2_ptr->log_area_start_address - table_data->data;
-    uint8_t start_method_params[12] = {};
-
-    /* platform class */
-    build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2);
-    /* reserved */
-    build_append_int_noprefix(table_data, 0, 2);
-    if (TPM_IS_TIS_ISA(tpm_find())) {
-        /* address of control area */
-        build_append_int_noprefix(table_data, 0, 8);
-        /* start method */
-        build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4);
-    } else if (TPM_IS_CRB(tpm_find())) {
-        build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8);
-        build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4);
-    } else {
-        g_warn_if_reached();
-    }
-
-    /* platform specific parameters */
-    g_array_append_vals(table_data, &start_method_params, 12);
-
-    /* log area minimum length */
-    build_append_int_noprefix(table_data, TPM_LOG_AREA_MINIMUM_SIZE, 4);
-
-    acpi_data_push(tcpalog, TPM_LOG_AREA_MINIMUM_SIZE);
-    bios_linker_loader_alloc(linker, ACPI_BUILD_TPMLOG_FILE, tcpalog, 1,
-                             false);
-
-    /* log area start address to be filled by Guest linker */
-    build_append_int_noprefix(table_data, 0, 8);
-    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
-                                   log_addr_offset, log_addr_size,
-                                   ACPI_BUILD_TPMLOG_FILE, 0);
-    build_header(linker, table_data,
-                 (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL);
-}
-
 #define HOLE_640K_START  (640 * KiB)
 #define HOLE_640K_END   (1 * MiB)
 
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index ed7c89309e..d27da03d64 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -437,4 +437,6 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms);
 
 void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f,
                 const char *oem_id, const char *oem_table_id);
+
+void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog);
 #endif

From 80bde69353924d99e2a7f5ac6be0ab4cf489d33c Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 1 Jun 2020 11:57:36 +0200
Subject: [PATCH 16/58] arm/acpi: TPM2 ACPI table support

Add a TPM2 ACPI table if a TPM2.0 sysbus device has been
dynamically instantiated.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

Message-Id: <20200601095737.32671-4-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/aml-build.c      | 5 +++--
 hw/arm/virt-acpi-build.c | 7 +++++++
 include/sysemu/tpm.h     | 2 ++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index b37052c1b4..d24e9e6c3a 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1885,17 +1885,18 @@ void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
     unsigned log_addr_offset =
         (char *)&tpm2_ptr->log_area_start_address - table_data->data;
     uint8_t start_method_params[12] = {};
+    TPMIf *tpmif = tpm_find();
 
     /* platform class */
     build_append_int_noprefix(table_data, TPM2_ACPI_CLASS_CLIENT, 2);
     /* reserved */
     build_append_int_noprefix(table_data, 0, 2);
-    if (TPM_IS_TIS_ISA(tpm_find())) {
+    if (TPM_IS_TIS_ISA(tpmif) || TPM_IS_TIS_SYSBUS(tpmif)) {
         /* address of control area */
         build_append_int_noprefix(table_data, 0, 8);
         /* start method */
         build_append_int_noprefix(table_data, TPM2_START_METHOD_MMIO, 4);
-    } else if (TPM_IS_CRB(tpm_find())) {
+    } else if (TPM_IS_CRB(tpmif)) {
         build_append_int_noprefix(table_data, TPM_CRB_ADDR_CTRL, 8);
         build_append_int_noprefix(table_data, TPM2_START_METHOD_CRB, 4);
     } else {
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 2cbccd5fe2..ca31f70f7f 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -41,12 +41,14 @@
 #include "hw/acpi/pci.h"
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/generic_event_device.h"
+#include "hw/acpi/tpm.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
 #include "hw/arm/virt.h"
 #include "hw/mem/nvdimm.h"
 #include "sysemu/numa.h"
 #include "sysemu/reset.h"
+#include "sysemu/tpm.h"
 #include "kvm_arm.h"
 #include "migration/vmstate.h"
 #include "hw/acpi/ghes.h"
@@ -848,6 +850,11 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
         build_iort(tables_blob, tables->linker, vms);
     }
 
+    if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) {
+        acpi_add_table(table_offsets, tables_blob);
+        build_tpm2(tables_blob, tables->linker, tables->tcpalog);
+    }
+
     /* XSDT is pointed to by RSDP */
     xsdt = tables_blob->len;
     build_xsdt(tables_blob, tables->linker, table_offsets, NULL, NULL);
diff --git a/include/sysemu/tpm.h b/include/sysemu/tpm.h
index f37851b1aa..03fb25941c 100644
--- a/include/sysemu/tpm.h
+++ b/include/sysemu/tpm.h
@@ -50,6 +50,8 @@ typedef struct TPMIfClass {
 
 #define TPM_IS_TIS_ISA(chr)                         \
     object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_ISA)
+#define TPM_IS_TIS_SYSBUS(chr)                      \
+    object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS_SYSBUS)
 #define TPM_IS_CRB(chr)                             \
     object_dynamic_cast(OBJECT(chr), TYPE_TPM_CRB)
 #define TPM_IS_SPAPR(chr)                           \

From 266345a867c290d6664c008b3bb3945395eca821 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 9 Jun 2020 14:54:05 +0200
Subject: [PATCH 17/58] test/tpm-emu: include sockets and channel headers in
 tpm-emu header

Include sockets and channel headers to that the header is
self-contained.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Message-Id: <20200609125409.24179-2-eric.auger@redhat.com>
---
 tests/qtest/tpm-emu.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qtest/tpm-emu.h b/tests/qtest/tpm-emu.h
index a4f1d64226..73f3bed0c4 100644
--- a/tests/qtest/tpm-emu.h
+++ b/tests/qtest/tpm-emu.h
@@ -16,6 +16,9 @@
 #define TPM_RC_FAILURE 0x101
 #define TPM2_ST_NO_SESSIONS 0x8001
 
+#include "qemu/sockets.h"
+#include "io/channel.h"
+
 struct tpm_hdr {
     uint16_t tag;
     uint32_t len;

From 6d6d1a23fc2ed42ab6ac76e289000474def8ff0c Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 9 Jun 2020 14:54:06 +0200
Subject: [PATCH 18/58] tests/acpi: Add void tables for Q35/TPM-TIS
 bios-tables-test

Add placeholders for TPM and DSDT reference tables for
Q35 TPM-TIS tests and ignore them for the time being.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>

Message-Id: <20200609125409.24179-3-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/data/acpi/q35/DSDT.tis                | 0
 tests/data/acpi/q35/TPM2.tis                | 0
 tests/qtest/bios-tables-test-allowed-diff.h | 2 ++
 3 files changed, 2 insertions(+)
 create mode 100644 tests/data/acpi/q35/DSDT.tis
 create mode 100644 tests/data/acpi/q35/TPM2.tis

diff --git a/tests/data/acpi/q35/DSDT.tis b/tests/data/acpi/q35/DSDT.tis
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/data/acpi/q35/TPM2.tis b/tests/data/acpi/q35/TPM2.tis
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..a2a45d1d31 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,3 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/q35/DSDT.tis",
+"tests/data/acpi/q35/TPM2.tis",

From c7504b9f32f2ee6ddcb2b0d69219046b3fa48c2d Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 9 Jun 2020 14:54:07 +0200
Subject: [PATCH 19/58] tests: tpm-emu: Remove assert on TPM2_ST_NO_SESSIONS

bios-tables-test executes SeaBIOS. Indeed FW is needed to
fetch tables from QEMU and put them into the guest RAM. Also
the FW patches cross table pointers. At some point, SeaBIOS
ends up calling the TPM2_CC_HierarchyControl command with
TPM2_ST_SESSIONS tag, most probably steming from
tpm_set_failure/tpm20_hierarchycontrol SeaBIOS call path.
This causes an assert() in the qtest tpm emulation code.

As the goal here is not to boot SeaBIOS completely but just
let it grab the ACPI tables and consolidate them, let's just
remove the assert().

Signed-off-by: Eric Auger <eric.auger@redhat.com>

Message-Id: <20200609125409.24179-4-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/qtest/tpm-emu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c
index c43ac4aef8..298d0eec74 100644
--- a/tests/qtest/tpm-emu.c
+++ b/tests/qtest/tpm-emu.c
@@ -49,7 +49,6 @@ static void *tpm_emu_tpm_thread(void *data)
         s->tpm_msg->tag = be16_to_cpu(s->tpm_msg->tag);
         s->tpm_msg->len = be32_to_cpu(s->tpm_msg->len);
         g_assert_cmpint(s->tpm_msg->len, >=, minhlen);
-        g_assert_cmpint(s->tpm_msg->tag, ==, TPM2_ST_NO_SESSIONS);
 
         s->tpm_msg = g_realloc(s->tpm_msg, s->tpm_msg->len);
         qio_channel_read(ioc, (char *)&s->tpm_msg->code,

From 5da7c35e25a4cba351602774e30cb59173af3c73 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 9 Jun 2020 14:54:08 +0200
Subject: [PATCH 20/58] bios-tables-test: Add Q35/TPM-TIS test

Test tables specific to the TPM-TIS instantiation.
The TPM2 is added in the framework. Also the DSDT
is updated with the TPM. The new function should be
be usable for CRB as well, later one.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

Message-Id: <20200609125409.24179-5-eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/qtest/Makefile.include   |  1 +
 tests/qtest/bios-tables-test.c | 58 ++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/tests/qtest/Makefile.include b/tests/qtest/Makefile.include
index 9e5a51d033..5023fa413d 100644
--- a/tests/qtest/Makefile.include
+++ b/tests/qtest/Makefile.include
@@ -262,6 +262,7 @@ tests/qtest/hd-geo-test$(EXESUF): tests/qtest/hd-geo-test.o $(libqos-obj-y)
 tests/qtest/boot-order-test$(EXESUF): tests/qtest/boot-order-test.o $(libqos-obj-y)
 tests/qtest/boot-serial-test$(EXESUF): tests/qtest/boot-serial-test.o $(libqos-obj-y)
 tests/qtest/bios-tables-test$(EXESUF): tests/qtest/bios-tables-test.o \
+        tests/qtest/tpm-emu.o $(test-io-obj-y) \
 	tests/qtest/boot-sector.o tests/qtest/acpi-utils.o $(libqos-obj-y)
 tests/qtest/pxe-test$(EXESUF): tests/qtest/pxe-test.o tests/qtest/boot-sector.o $(libqos-obj-y)
 tests/qtest/microbit-test$(EXESUF): tests/qtest/microbit-test.o
diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index c9843829b3..53f104a9c5 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -57,6 +57,9 @@
 #include "qemu/bitmap.h"
 #include "acpi-utils.h"
 #include "boot-sector.h"
+#include "tpm-emu.h"
+#include "hw/acpi/tpm.h"
+
 
 #define MACHINE_PC "pc"
 #define MACHINE_Q35 "q35"
@@ -874,6 +877,60 @@ static void test_acpi_piix4_tcg_numamem(void)
     free_test_data(&data);
 }
 
+uint64_t tpm_tis_base_addr;
+
+static void test_acpi_tcg_tpm(const char *machine, const char *tpm_if,
+                              uint64_t base)
+{
+    gchar *tmp_dir_name = g_strdup_printf("qemu-test_acpi_%s_tcg_%s.XXXXXX",
+                                          machine, tpm_if);
+    char *tmp_path = g_dir_make_tmp(tmp_dir_name, NULL);
+    TestState test;
+    test_data data;
+    GThread *thread;
+    char *args, *variant = g_strdup_printf(".%s", tpm_if);
+
+    tpm_tis_base_addr = base;
+
+    module_call_init(MODULE_INIT_QOM);
+
+    test.addr = g_new0(SocketAddress, 1);
+    test.addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+    test.addr->u.q_unix.path = g_build_filename(tmp_path, "sock", NULL);
+    g_mutex_init(&test.data_mutex);
+    g_cond_init(&test.data_cond);
+    test.data_cond_signal = false;
+
+    thread = g_thread_new(NULL, tpm_emu_ctrl_thread, &test);
+    tpm_emu_test_wait_cond(&test);
+
+    memset(&data, 0, sizeof(data));
+    data.machine = machine;
+    data.variant = variant;
+
+    args = g_strdup_printf(
+        " -chardev socket,id=chr,path=%s"
+        " -tpmdev emulator,id=dev,chardev=chr"
+        " -device tpm-%s,tpmdev=dev",
+        test.addr->u.q_unix.path, tpm_if);
+
+    test_acpi_one(args, &data);
+
+    g_thread_join(thread);
+    g_unlink(test.addr->u.q_unix.path);
+    qapi_free_SocketAddress(test.addr);
+    g_rmdir(tmp_path);
+    g_free(variant);
+    g_free(tmp_path);
+    g_free(tmp_dir_name);
+    free_test_data(&data);
+}
+
+static void test_acpi_q35_tcg_tpm_tis(void)
+{
+    test_acpi_tcg_tpm("q35", "tis", 0xFED40000);
+}
+
 static void test_acpi_tcg_dimm_pxm(const char *machine)
 {
     test_data data;
@@ -1037,6 +1094,7 @@ int main(int argc, char *argv[])
             return ret;
         }
 
+        qtest_add_func("acpi/q35/tpm-tis", test_acpi_q35_tcg_tpm_tis);
         qtest_add_func("acpi/piix4", test_acpi_piix4_tcg);
         qtest_add_func("acpi/piix4/bridge", test_acpi_piix4_tcg_bridge);
         qtest_add_func("acpi/q35", test_acpi_q35_tcg);

From cae98d8c86b7c76eae1c389ff7b9bd98143f656d Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 9 Jun 2020 14:54:09 +0200
Subject: [PATCH 21/58] bios-tables-test: Generate reference tables for
 Q35/TPM-TIS

TPM2, DSDT tables were generated using
tests/data/acpi/rebuild-expected-aml.sh

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Message-Id: <20200609125409.24179-6-eric.auger@redhat.com>

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/data/acpi/q35/DSDT.tis                | Bin 0 -> 8357 bytes
 tests/data/acpi/q35/TPM2.tis                | Bin 0 -> 76 bytes
 tests/qtest/bios-tables-test-allowed-diff.h |   2 --
 3 files changed, 2 deletions(-)

diff --git a/tests/data/acpi/q35/DSDT.tis b/tests/data/acpi/q35/DSDT.tis
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..56b6fb0c3298517d080e38fea05a748b9f1dba54 100644
GIT binary patch
literal 8357
zcmb7JON=AM8Lskc+U@pC+vC|cthEqD1O%QLASe+DcDG-d@l21~y9B(Edp3~F?nc4_
zR)VaMW(JAJN{IpnluINpa-v+KTyhV}IR_3affVJCYouHfMXYjI=liQ%yH&PFVtr`$
zU;TamUtjg(>#p)czw_H7WBd=5ZLbj)D?bc8A8nj5M*H;dHWL?Fci*qndpr|M@a{oI
z>|;~03Xl1f^L75~z37uDdi~vq@AlcHPu-6%?e^b(FS^7CbnQ~#wrR)RuzO*p*FE&?
zz-yMQUZd$d<<D))>UnZ6ZuQE3n_0c{O`jJw7kb;Fm$-42dH(aO!>#0CFSl&F-PYgM
zzdC#7xlf*dz5Lm;U;O08RXYK|8vZT(ulH#aU61Uu(a`;HaGAeFba8BTxcuQj9F}b^
z9qC8Za?_DblxlT!)ood_uT`rNR6DA#96dzeFdmJd(_#)|pWn%_MEAg@c=<wg%P$6P
zzgH|g-3vQq)~A-JPffc&k8C#@Iv)%oHi$Cg@rVr<KN#$?k9HmQkN+k|Y-BHCB-&vv
zXD_~!#?T`<OEC5(bDvvyoVr89hT%&!!*Wjy6*_Phoe7%lotPcjPIrG3XE0{(+p+G!
zN!n-LcB>p`9Df!3Z3Z1bCQ#LweMqPkpsLI(SGTxV!DchS#@$$YBCaY@weI1Q8x<c#
z<vELE9H38i_=LSu=Mjq-=XL%PdkO#NV(54G@Zr|>`Ma#8WrJne$#cvl)Ec_As&XeL
z=ik7(cDBynh`$lHtaJM!^BNT0<s`YhW@b6J<G16?I_C<`H`*QVZf`QEL!&KH_gSBD
z>m1{6GAn3$HCAwX_~-pSirinmz-csG@1Nqz4ceU`zwz3BK9j(SWSd(XRVMd2_G>Kf
zP`e#NP**`5L4)P931>XUeI;>|h*$tF%432Ds0xXR`~n*jOkgJwF%>k%1QSrfL`Y2Z
zMquNDsd_+{g2*bMamfT!g~U`<a0yDR$deFE1x-k%>O4YSXVTD_6ifw8N~Y>OLS1Ld
z(3uiU1x=Yc5$ZY@44n&x&IMB^LS3h2=(G%-mZ=k=t`j$_HqvQBXWG<>P}gZ2I&DLz
zZR$j*>&zHBGltHLsS}~D6AumTd^(0s$JB{X*O@hRW(}QLQzt@Q=c1u=(a^bQ>O`pP
zTrzYn89J9tod|WEuA$R4bh@Tagu2e0p)+Ub%$Yh7>N@j=&b*;BZ|X#->s&T;E*m<R
zO`QmJodrW@!O&SSbt2Stt{6I344o^cPK3J7V}{OShR$QAPK3J7<A%=ThR)-rPK3J7
z6Nb(chRzeFPK3J7lY&{EJPA(<W@YlQJ1Lpoy^xsx1bfP0o-&xHOeR8|dD>u}HkhYP
zCPJNg#$cW?m}g8TLY?`5!F<4AK43Bt>dXfPQ@PB8f~lVI4@#zba3dt9a+0$~&9g?$
zvt~_%vZe(CEo%l^51V%l6#FPVF;E03D}y)!RTKs)al(=~;}WPGGbW)bB&L!E1C>~1
zpc<46R6+%l3=|=HqYPAHClS?DUP}flp?3`wA-XQGFi?q|M8s5sO9m>Tf=LF7Q0at$
zN~~bQKs6{CsDuh887M-f69y`=f(ZlFpk$yDDwt%T2$fD4sKg2;3{-=Xfl8=gl7S*r
zI$@v^E0{1)4N3+op@K;Uicsl<fl91k!ay}B8K{H`CK)I~r4t4!v4RN$)u3dc5-ONv
zpa_*t7^uVwCJa=Al7UL7V3L6%R61dx5-XT6Pz_23Dxrc&28vMWgn>$|V8TE(C>f}P
z3MLsSLZuT1DzSnI1J$5ppb{#WWS|I@P8g`f3MLFxgOY(ts9=(TB2+qIpb{&XFi;Ij
z1}dR~Nd}5g>4bqwtYE@GH7FUVgbF4ZC_<$Z1}d?F2?N!jWS|l%m}H;`l};F_#0n-1
zRD+U%N~mCxfg)5oVW1K#m@rTcN(L&Sf=LF7Q0at$N~~bQKs6{CsDuh887M-f69y`=
zf(ZlFpk$yDDwt%T2$fD4sKg2;3{-=Xfl8=gl7S*rI$@v^E0{1)4N3+op@K;Uicsl<
zfg;ij6p?PA2z3KRs2Qlngn?>IGEj|42C6Y(pc)efsxiqxH6|IT#)N@tOc<!fBm>o$
zWS|-o2C6Y(pc<15RAZ8XYD^d?B6W6QponnB#9)$vBE)G>Au-k1!axzJv4w#mQe#U7
zicpO$87M-Z6;AbFp={_3#K-E0K1gq-=&#&8_|6k?`bwqGcA6OAL3^hR$K9#W#-ojI
zF-%kJRB69Pn>uY8R$;lh6L$D8ecj&Q!+b<~WM>LbHg~*s8h~~ww`gbHPq8%33*m$M
z2DRBqY@7Y-d-f8eJ#?atNs7${yd#v=H@$<I4VpKx(CsjH%wB$fk)`8sH-_go!;=`<
z3QJ<jWJFy+Z_0&S&u@n8E!wyDMavR>urnaMwuq1S!gk1S>`nSW<*9vAy)soDa|B<z
zdPS>V5!EZ`pI*IEm|wlpJ#3M7Z>qW-yN~joR^Aiky{Yov5#_x}d4IY*W^BH;dS5H=
zi}L<ddH;y={-k_$x_sh3%2&1WRZ+e=Rla&e`Rb&6ZMuB&KFZg$@-<PuHdVfMMETmJ
ze0{on>ORWXweodQzCKmHenk2Dq<mw#{K9>dZ)oKkqI_ejeB+4n4U|vNqinjoE6b~g
z2VN_1yUWQpv#pR4w><vnzMiC;zK!jinGWw7GwEd8+;lVdW!_AO7m%5BvTbg<nfnZL
zoOGWlVR$Q<NhjOprklBM3T8UIs?4O5ZFAGj+y^UWI=sKkq?2uP)6HDh$INtishLS9
z+vcX5xlWIp>F}mAlTNmo>D<%W?D$gHs$gva>j+L9ljH6tX5!_d!^L}3+CjS-s;kt-
z%?WSHdcpZ(<CWL@KfCtR;;S#e_R6(afBf=mtjoP%b7OL~W^KF9zjVHIxPxhHcoNJq
z^QD7%_k)KhD6^tR)8}4uE973gb%A@;ACx#tcwWo5x(9gz#l5h)Sp?EAcZ!&1Zf-TL
zKBLC&;r62!g%tEf_AguZf@Z0-TB07vMUrG8=XGi?#D?}@kG<oSN~Oz%M#@9=boZf)
zPbAc|J!z!AkH`JJ>1Gt`vuJo~Fnxe}-?x{d?l)(SY;KIALMus%?ynG^PK69}^A{S6
zUg&LEN6FyPr{&Bb;^=yxO((?&59^5;LE=ncxTfJ4eBPhw8PI(m-P}kF8?&jz`JA81
zj?d--9J%HTZKb{Ku;Uyhbnf|@=`0lYFVE4{w`0?-1idodh#{EUyYAc1&{c(-4?kR>
zxU#GfU0=8cTjfqJV_OWL`t%Z5Si}-tCqrV`-sLNW)3|)=LA6&ct>FZT6(qWp>8!d1
zIwEem$wggynz^l<yx`@6AT7t#9Q!ZtWmya#+4zlVgGHs=&TWTVXy>@A@%svgZ##G6
zELLF{`_H?0I>_VnVmThuizWSHOdJuftq860c>G6kfU(~jM?M>m|9nRt`Rsc)9=(@M
zwow9b$53zlD8`B%4&S=N0(xj+l`a9fa*;C|Yz3t<<jMyPwQElpxn`A-ekh(W#Z)oH
zu`uR7auJkpy_ZiVIFe)k$zI+{*fAT^ScsmYBh(Z3yZ45%d-!<mc~3t1vpdz+d7R5N
zLSwy7ui_@%Bt6I6?ME((#jW}ya(yuqmP&GY5i-6NmY$ITMw~O)QMl@}k=R*NJL_U6
z6?j`6P_`fL1TDGDmTiT#4xV1CllsO2r#@W!ogi4ldg{)tJNAp+17tuL1kEiAo1hp3
zZL!&*y;gA(+hUhE>&$r(99os8h_|*cT(AmjouIar9;QG1cn|YBBRr~caFxC4u;A96
zJvKqFa<ta0k=UZkYxx8XEB4m&NxI4k)e}8GY&pYvK`uT2?tg!O_Vm+#+8RCl3)lM1
zv2d6fTw$YJ(vFQe9>-VsD7*UjIvYMVxMIh!0)Ca@Nkx5$c5(Rnu8q015${Ln{!SnA
z+AS;se;?rZa0Lt1eQJM?dcqlc)mj@qG`NgJs(GC=+HVnL9hU%M)`u4OiIE)x_==tE
z)7S`r1P8O=)Cl#eNuU>w>?6J`MR)BOfnDmH)@$l5`XyT+lPfJJk<>DijzpeOF3#ET
qDLIV)>O|jO{_C~cug=sjeDUvFKN_aRFvfX&xQszm{D@BGi2Wba<i~0N

literal 0
HcmV?d00001

diff --git a/tests/data/acpi/q35/TPM2.tis b/tests/data/acpi/q35/TPM2.tis
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7878a6e79a6a406d99ca1f5e9a528eb392b8d793 100644
GIT binary patch
literal 76
wcmWFu@HO&bU|?V=a`Jcf2v%^42yhMoiZKGkKx`0=4A_u4U^Ym_e|8WP0Iiz{0RR91

literal 0
HcmV?d00001

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h
index a2a45d1d31..dfb8523c8b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1,3 +1 @@
 /* List of comma-separated changed AML files to ignore */
-"tests/data/acpi/q35/DSDT.tis",
-"tests/data/acpi/q35/TPM2.tis",

From 12fc8903a8ee09fb5f642de82699a0b211e1b5a7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 May 2020 12:04:37 +0200
Subject: [PATCH 22/58] virtio-balloon: fix free page hinting without an
 iothread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case we don't have an iothread, we mark the feature as abscent but
still add the queue. 'free_page_bh' remains set to NULL.

qemu-system-i386 \
        -M microvm \
        -nographic \
        -device virtio-balloon-device,free-page-hint=true \
        -nographic \
        -display none \
        -monitor none \
        -serial none \
        -qtest stdio

Doing a "write 0xc0000e30 0x24
0x030000000300000003000000030000000300000003000000030000000300000003000000"

We will trigger a SEGFAULT. Let's move the check and bail out.

While at it, move the static initializations to instance_init().
free_page_report_status and block_iothread are implicitly set to the
right values (0/false) already, so drop the initialization.

Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT")
Reported-by: Alexander Bulekov <alxndr@bu.edu>
Cc: qemu-stable@nongnu.org
Cc: Wei Wang <wei.w.wang@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Philippe Mathieu-Daudé <philmd@redhat.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200520100439.19872-2-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 065cd450f1..7ff6a7aa7c 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -789,6 +789,13 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
+        !s->iothread) {
+        error_setg(errp, "'free-page-hint' requires 'iothread' to be set");
+        virtio_cleanup(vdev);
+        return;
+    }
+
     s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
@@ -797,24 +804,11 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
                            VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
         s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
                                            virtio_balloon_handle_free_page_vq);
-        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
-        s->free_page_report_cmd_id =
-                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
-        s->free_page_report_notify.notify =
-                                       virtio_balloon_free_page_report_notify;
         precopy_add_notifier(&s->free_page_report_notify);
-        if (s->iothread) {
-            object_ref(OBJECT(s->iothread));
-            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
-                                       virtio_ballloon_get_free_page_hints, s);
-            qemu_mutex_init(&s->free_page_lock);
-            qemu_cond_init(&s->free_page_cond);
-            s->block_iothread = false;
-        } else {
-            /* Simply disable this feature if the iothread wasn't created. */
-            s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT);
-            virtio_error(vdev, "iothread is missing");
-        }
+
+        object_ref(OBJECT(s->iothread));
+        s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
+                                     virtio_ballloon_get_free_page_hints, s);
     }
     reset_stats(s);
 }
@@ -892,6 +886,11 @@ static void virtio_balloon_instance_init(Object *obj)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(obj);
 
+    qemu_mutex_init(&s->free_page_lock);
+    qemu_cond_init(&s->free_page_cond);
+    s->free_page_report_cmd_id = VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
+    s->free_page_report_notify.notify = virtio_balloon_free_page_report_notify;
+
     object_property_add(obj, "guest-stats", "guest statistics",
                         balloon_stats_get_all, NULL, NULL, s);
 

From 49b01711b8eb3796c6904c7f85d2431572cfe54f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 May 2020 12:04:38 +0200
Subject: [PATCH 23/58] virtio-balloon: fix free page hinting check on
 unrealize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Checking against guest features is wrong. We allocated data structures
based on host features. We can rely on "free_page_bh" as an indicator
whether to un-do stuff instead.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT")
Cc: qemu-stable@nongnu.org
Cc: Wei Wang <wei.w.wang@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Philippe Mathieu-Daudé <philmd@redhat.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200520100439.19872-3-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 7ff6a7aa7c..32e9fe3f64 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -818,7 +818,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
 
-    if (virtio_balloon_free_page_support(s)) {
+    if (s->free_page_bh) {
         qemu_bh_delete(s->free_page_bh);
         virtio_balloon_free_page_stop(s);
         precopy_remove_notifier(&s->free_page_report_notify);

From 105aef9c9479786d27c1c45c9b0b1fa03dc46be3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 May 2020 12:04:39 +0200
Subject: [PATCH 24/58] virtio-balloon: unref the iothread when unrealizing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We took a reference when realizing, so let's drop that reference when
unrealizing.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Fixes: c13c4153f76d ("virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT")
Cc: qemu-stable@nongnu.org
Cc: Wei Wang <wei.w.wang@intel.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200520100439.19872-4-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-balloon.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 32e9fe3f64..cff8eab6a1 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -820,6 +820,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev)
 
     if (s->free_page_bh) {
         qemu_bh_delete(s->free_page_bh);
+        object_unref(OBJECT(s->iothread));
         virtio_balloon_free_page_stop(s);
         precopy_remove_notifier(&s->free_page_report_notify);
     }

From 7483cbbaf8204de330b7362339a4a070b114b425 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Tue, 26 May 2020 21:14:00 -0700
Subject: [PATCH 25/58] virtio-balloon: Implement support for page poison
 reporting feature

We need to make certain to advertise support for page poison reporting if
we want to actually get data on if the guest will be poisoning pages.

Add a value for reporting the poison value being used if page poisoning is
enabled in the guest. With this we can determine if we will need to skip
free page reporting when it is enabled in the future.

The value currently has no impact on existing balloon interfaces. In the
case of existing balloon interfaces the onus is on the guest driver to
reapply whatever poison is in place.

When we add free page reporting the poison value is used to determine if
we can perform in-place page reporting. The expectation is that a reported
page will already contain the value specified by the poison, and the
reporting of the page should not change that value.

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Message-Id: <20200527041400.12700.33251.stgit@localhost.localdomain>
---
 hw/core/machine.c                  |  4 +++-
 hw/virtio/virtio-balloon.c         | 29 +++++++++++++++++++++++++++++
 include/hw/virtio/virtio-balloon.h |  1 +
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index bb3a7b18b1..9eca7d8c9b 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,7 +28,9 @@
 #include "hw/mem/nvdimm.h"
 #include "migration/vmstate.h"
 
-GlobalProperty hw_compat_5_0[] = {};
+GlobalProperty hw_compat_5_0[] = {
+    { "virtio-balloon-device", "page-poison", "false" },
+};
 const size_t hw_compat_5_0_len = G_N_ELEMENTS(hw_compat_5_0);
 
 GlobalProperty hw_compat_4_2[] = {
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index cff8eab6a1..31d3c88482 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -634,6 +634,7 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 
     config.num_pages = cpu_to_le32(dev->num_pages);
     config.actual = cpu_to_le32(dev->actual);
+    config.poison_val = cpu_to_le32(dev->poison_val);
 
     if (dev->free_page_report_status == FREE_PAGE_REPORT_S_REQUESTED) {
         config.free_page_report_cmd_id =
@@ -683,6 +684,14 @@ static ram_addr_t get_current_ram_size(void)
     return size;
 }
 
+static bool virtio_balloon_page_poison_support(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+}
+
 static void virtio_balloon_set_config(VirtIODevice *vdev,
                                       const uint8_t *config_data)
 {
@@ -697,6 +706,10 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
         qapi_event_send_balloon_change(vm_ram_size -
                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
     }
+    dev->poison_val = 0;
+    if (virtio_balloon_page_poison_support(dev)) {
+        dev->poison_val = le32_to_cpu(config.poison_val);
+    }
     trace_virtio_balloon_set_config(dev->actual, oldactual);
 }
 
@@ -755,6 +768,17 @@ static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
     }
 };
 
+static const VMStateDescription vmstate_virtio_balloon_page_poison = {
+    .name = "vitio-balloon-device/page-poison",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = virtio_balloon_page_poison_support,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(poison_val, VirtIOBalloon),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio_balloon_device = {
     .name = "virtio-balloon-device",
     .version_id = 1,
@@ -767,6 +791,7 @@ static const VMStateDescription vmstate_virtio_balloon_device = {
     },
     .subsections = (const VMStateDescription * []) {
         &vmstate_virtio_balloon_free_page_report,
+        &vmstate_virtio_balloon_page_poison,
         NULL
     }
 };
@@ -849,6 +874,8 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev)
         g_free(s->stats_vq_elem);
         s->stats_vq_elem = NULL;
     }
+
+    s->poison_val = 0;
 }
 
 static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
@@ -916,6 +943,8 @@ static Property virtio_balloon_properties[] = {
                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
     DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
+    DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features,
+                    VIRTIO_BALLOON_F_PAGE_POISON, true),
     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
      * property retains this quirk for QEMU 4.1 machine types.
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index d1c968d237..7fe78e5c14 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -70,6 +70,7 @@ typedef struct VirtIOBalloon {
     uint32_t host_features;
 
     bool qemu_4_0_config_size;
+    uint32_t poison_val;
 } VirtIOBalloon;
 
 #endif

From 91b867191ddf73c64f51ca2ddb489b922ed4058e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Tue, 26 May 2020 21:14:07 -0700
Subject: [PATCH 26/58] virtio-balloon: Provide an interface for free page
 reporting

Add support for free page reporting. The idea is to function very similar
to how the balloon works in that we basically end up madvising the page as
not being used. However we don't really need to bother with any deflate
type logic since the page will be faulted back into the guest when it is
read or written to.

This provides a new way of letting the guest proactively report free
pages to the hypervisor, so the hypervisor can reuse them. In contrast to
inflate/deflate that is triggered via the hypervisor explicitly.

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Message-Id: <20200527041407.12700.73735.stgit@localhost.localdomain>
---
 hw/virtio/virtio-balloon.c         | 72 ++++++++++++++++++++++++++++++
 include/hw/virtio/virtio-balloon.h |  2 +-
 2 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 31d3c88482..10507b2a43 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -321,6 +321,67 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
     balloon_stats_change_timer(s, 0);
 }
 
+static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
+    VirtQueueElement *elem;
+
+    while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) {
+        unsigned int i;
+
+        /*
+         * When we discard the page it has the effect of removing the page
+         * from the hypervisor itself and causing it to be zeroed when it
+         * is returned to us. So we must not discard the page if it is
+         * accessible by another device or process, or if the guest is
+         * expecting it to retain a non-zero value.
+         */
+        if (qemu_balloon_is_inhibited() || dev->poison_val) {
+            goto skip_element;
+        }
+
+        for (i = 0; i < elem->in_num; i++) {
+            void *addr = elem->in_sg[i].iov_base;
+            size_t size = elem->in_sg[i].iov_len;
+            ram_addr_t ram_offset;
+            RAMBlock *rb;
+
+            /*
+             * There is no need to check the memory section to see if
+             * it is ram/readonly/romd like there is for handle_output
+             * below. If the region is not meant to be written to then
+             * address_space_map will have allocated a bounce buffer
+             * and it will be freed in address_space_unmap and trigger
+             * and unassigned_mem_write before failing to copy over the
+             * buffer. If more than one bad descriptor is provided it
+             * will return NULL after the first bounce buffer and fail
+             * to map any resources.
+             */
+            rb = qemu_ram_block_from_host(addr, false, &ram_offset);
+            if (!rb) {
+                trace_virtio_balloon_bad_addr(elem->in_addr[i]);
+                continue;
+            }
+
+            /*
+             * For now we will simply ignore unaligned memory regions, or
+             * regions that overrun the end of the RAMBlock.
+             */
+            if (!QEMU_IS_ALIGNED(ram_offset | size, qemu_ram_pagesize(rb)) ||
+                (ram_offset + size) > qemu_ram_get_used_length(rb)) {
+                continue;
+            }
+
+            ram_block_discard_range(rb, ram_offset, size);
+        }
+
+skip_element:
+        virtqueue_push(vq, elem, 0);
+        virtio_notify(vdev, vq);
+        g_free(elem);
+    }
+}
+
 static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
@@ -835,6 +896,12 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
         s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
                                      virtio_ballloon_get_free_page_hints, s);
     }
+
+    if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) {
+        s->reporting_vq = virtio_add_queue(vdev, 32,
+                                           virtio_balloon_handle_report);
+    }
+
     reset_stats(s);
 }
 
@@ -858,6 +925,9 @@ static void virtio_balloon_device_unrealize(DeviceState *dev)
     if (s->free_page_vq) {
         virtio_delete_queue(s->free_page_vq);
     }
+    if (s->reporting_vq) {
+        virtio_delete_queue(s->reporting_vq);
+    }
     virtio_cleanup(vdev);
 }
 
@@ -945,6 +1015,8 @@ static Property virtio_balloon_properties[] = {
                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
     DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features,
                     VIRTIO_BALLOON_F_PAGE_POISON, true),
+    DEFINE_PROP_BIT("free-page-reporting", VirtIOBalloon, host_features,
+                    VIRTIO_BALLOON_F_REPORTING, false),
     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
      * property retains this quirk for QEMU 4.1 machine types.
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index 7fe78e5c14..d49fef00ce 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -42,7 +42,7 @@ enum virtio_balloon_free_page_report_status {
 
 typedef struct VirtIOBalloon {
     VirtIODevice parent_obj;
-    VirtQueue *ivq, *dvq, *svq, *free_page_vq;
+    VirtQueue *ivq, *dvq, *svq, *free_page_vq, *reporting_vq;
     uint32_t free_page_report_status;
     uint32_t num_pages;
     uint32_t actual;

From b963ea19f896e34e21275ff69fa4565948f703c8 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Sun, 7 Jun 2020 07:20:22 +0200
Subject: [PATCH 27/58] MAINTAINERS: Fix the classification of
 bios-tables-test-allowed-diff.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The file tests/qtest/bios-tables-test-allowed-diff.h is currently only
assigned to the qtest section according MAINTAINERS. However, this file
normally only gets updated when the ACPI tables changed - something the
qtest maintainers don't have much clue of. Thus this file should rather
be assigned to the ACPI maintainers instead.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20200607052022.12222-1-thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6e7890ce82..0c5ed09ad5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1531,7 +1531,7 @@ F: hw/acpi/*
 F: hw/smbios/*
 F: hw/i386/acpi-build.[hc]
 F: hw/arm/virt-acpi-build.c
-F: tests/qtest/bios-tables-test.c
+F: tests/qtest/bios-tables-test*
 F: tests/qtest/acpi-utils.[hc]
 F: tests/data/acpi/
 
@@ -2321,6 +2321,7 @@ S: Maintained
 F: qtest.c
 F: accel/qtest.c
 F: tests/qtest/
+X: tests/qtest/bios-tables-test-allowed-diff.h
 
 Device Fuzzing
 M: Alexander Bulekov <alxndr@bu.edu>

From 0dabc0f6544f2c0310546f6d6cf3b68979580a9c Mon Sep 17 00:00:00 2001
From: Julia Suvorova <jusual@redhat.com>
Date: Thu, 4 Jun 2020 14:59:46 +0200
Subject: [PATCH 28/58] hw/pci/pcie: Move hot plug capability check to pre_plug
 callback

Check for hot plug capability earlier to avoid removing devices attached
during the initialization process.

Run qemu with an unattached drive:
  -drive file=$FILE,if=none,id=drive0 \
  -device pcie-root-port,id=rp0,slot=3,bus=pcie.0,hotplug=off
Hotplug a block device:
  device_add virtio-blk-pci,id=blk0,drive=drive0,bus=rp0
If hotplug fails on plug_cb, drive0 will be deleted.

Fixes: 0501e1aa1d32a6 ("hw/pci/pcie: Forbid hot-plug if it's disabled on the slot")

Acked-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Julia Suvorova <jusual@redhat.com>
Message-Id: <20200604125947.881210-1-jusual@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pcie.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index f50e10b8fb..5b9c022d91 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -407,6 +407,17 @@ static void pcie_cap_slot_plug_common(PCIDevice *hotplug_dev, DeviceState *dev,
 void pcie_cap_slot_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
                                Error **errp)
 {
+    PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev);
+    uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap;
+    uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP);
+
+    /* Check if hot-plug is disabled on the slot */
+    if (dev->hotplugged && (sltcap & PCI_EXP_SLTCAP_HPC) == 0) {
+        error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'",
+                         DEVICE(hotplug_pdev)->id);
+        return;
+    }
+
     pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, errp);
 }
 
@@ -415,7 +426,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
 {
     PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev);
     uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap;
-    uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP);
     PCIDevice *pci_dev = PCI_DEVICE(dev);
 
     /* Don't send event when device is enabled during qemu machine creation:
@@ -431,13 +441,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
         return;
     }
 
-    /* Check if hot-plug is disabled on the slot */
-    if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) {
-        error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'",
-                         DEVICE(hotplug_pdev)->id);
-        return;
-    }
-
     /* To enable multifunction hot-plug, we just ensure the function
      * 0 added last. When function 0 is added, we set the sltsta and
      * inform OS via event notification.

From f7d6a635fa3b7797f9d072e280f065bf3cfcd24d Mon Sep 17 00:00:00 2001
From: Prasad J Pandit <pjp@fedoraproject.org>
Date: Thu, 4 Jun 2020 17:05:25 +0530
Subject: [PATCH 29/58] pci: assert configuration access is within bounds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While accessing PCI configuration bytes, assert that
'address + len' is within PCI configuration space.

Generally it is within bounds. This is more of a defensive
assert, in case a buggy device was to send 'address' which
may go out of bounds.

Suggested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Message-Id: <20200604113525.58898-1-ppandit@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pci.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 70c66965f5..7bf2ae6d92 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1381,6 +1381,8 @@ uint32_t pci_default_read_config(PCIDevice *d,
 {
     uint32_t val = 0;
 
+    assert(address + len <= pci_config_size(d));
+
     if (pci_is_express_downstream_port(d) &&
         ranges_overlap(address, len, d->exp.exp_cap + PCI_EXP_LNKSTA, 2)) {
         pcie_sync_bridge_lnk(d);
@@ -1394,6 +1396,8 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
     int i, was_irq_disabled = pci_irq_disabled(d);
     uint32_t val = val_in;
 
+    assert(addr + l <= pci_config_size(d));
+
     for (i = 0; i < l; val >>= 8, ++i) {
         uint8_t wmask = d->wmask[addr + i];
         uint8_t w1cmask = d->w1cmask[addr + i];

From ea2fe4dfe4e5d32e69ae749a3b9f287aaf8b898e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= <f4bug@amsat.org>
Date: Mon, 1 Jun 2020 16:29:24 +0200
Subject: [PATCH 30/58] hw/pci-host/prep: Correct RAVEN bus bridge memory
 region size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

memory_region_set_size() handle the 16 Exabytes limit by
special-casing the UINT64_MAX value. This is not a problem
for the 32-bit maximum, 4 GiB.
By using the UINT32_MAX value, the bm-raven MemoryRegion
ends up missing 1 byte:

  $ qemu-system-ppc -M prep -S -monitor stdio -usb
  memory-region: bm-raven
    0000000000000000-00000000fffffffe (prio 0, i/o): bm-raven
      0000000000000000-000000003effffff (prio 0, i/o): alias bm-pci-memory @pci-memory 0000000000000000-000000003effffff
      0000000080000000-00000000ffffffff (prio 0, i/o): alias bm-system @system 0000000000000000-000000007fffffff

Fix by using the correct value. We now have:

  memory-region: bm-raven
    0000000000000000-00000000ffffffff (prio 0, i/o): bm-raven
      0000000000000000-000000003effffff (prio 0, i/o): alias bm-pci-memory @pci-memory 0000000000000000-000000003effffff
      0000000080000000-00000000ffffffff (prio 0, i/o): alias bm-system @system 0000000000000000-000000007fffffff

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20200601142930.29408-3-f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 hw/pci-host/prep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/pci-host/prep.c b/hw/pci-host/prep.c
index 1a02e9a670..88e2fc66a9 100644
--- a/hw/pci-host/prep.c
+++ b/hw/pci-host/prep.c
@@ -294,7 +294,7 @@ static void raven_pcihost_initfn(Object *obj)
                              &s->pci_memory, &s->pci_io, 0, TYPE_PCI_BUS);
 
     /* Bus master address space */
-    memory_region_init(&s->bm, obj, "bm-raven", UINT32_MAX);
+    memory_region_init(&s->bm, obj, "bm-raven", 4 * GiB);
     memory_region_init_alias(&s->bm_pci_memory_alias, obj, "bm-pci-memory",
                              &s->pci_memory, 0,
                              memory_region_size(&s->pci_memory));

From 2dc48da25547bf82dee81f4e60766509b28e736e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= <f4bug@amsat.org>
Date: Mon, 1 Jun 2020 16:29:25 +0200
Subject: [PATCH 31/58] hw/pci/pci_bridge: Correct pci_bridge_io memory region
 size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

memory_region_set_size() handle the 16 Exabytes limit by
special-casing the UINT64_MAX value. This is not a problem
for the 32-bit maximum, 4 GiB.
By using the UINT32_MAX value, the pci_bridge_io MemoryRegion
ends up missing 1 byte:

  (qemu) info mtree
  memory-region: pci_bridge_io
    0000000000000000-00000000fffffffe (prio 0, i/o): pci_bridge_io
      0000000000000060-0000000000000060 (prio 0, i/o): i8042-data
      0000000000000064-0000000000000064 (prio 0, i/o): i8042-cmd
      00000000000001ce-00000000000001d1 (prio 0, i/o): vbe
      0000000000000378-000000000000037f (prio 0, i/o): parallel
      00000000000003b4-00000000000003b5 (prio 0, i/o): vga
      ...

Fix by using the correct value. We now have:

  memory-region: pci_bridge_io
    0000000000000000-00000000ffffffff (prio 0, i/o): pci_bridge_io
      0000000000000060-0000000000000060 (prio 0, i/o): i8042-data
      0000000000000064-0000000000000064 (prio 0, i/o): i8042-cmd
      ...

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20200601142930.29408-4-f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 hw/pci/pci_bridge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 97967d12eb..3ba3203f72 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -30,6 +30,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/pci/pci_bus.h"
 #include "qemu/module.h"
@@ -381,7 +382,7 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
     memory_region_init(&br->address_space_mem, OBJECT(br), "pci_bridge_pci", UINT64_MAX);
     sec_bus->address_space_io = &br->address_space_io;
     memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
-                       UINT32_MAX);
+                       4 * GiB);
     br->windows = pci_bridge_region_init(br);
     QLIST_INIT(&sec_bus->child);
     QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);

From 37e7211cae59d090eafdf26fb2b21ca41df7aed8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= <f4bug@amsat.org>
Date: Mon, 1 Jun 2020 16:29:26 +0200
Subject: [PATCH 32/58] hw/pci/pci_bridge: Use the IEC binary prefix
 definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IEC binary prefixes ease code review: the unit is explicit.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20200601142930.29408-5-f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 hw/pci/pci_bridge.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 3ba3203f72..3789c17edc 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -423,14 +423,14 @@ int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset,
     }
 
     if (res_reserve.mem_non_pref != (uint64_t)-1 &&
-        res_reserve.mem_non_pref >= (1ULL << 32)) {
+        res_reserve.mem_non_pref >= 4 * GiB) {
         error_setg(errp,
                    "PCI resource reserve cap: mem-reserve must be less than 4G");
         return -EINVAL;
     }
 
     if (res_reserve.mem_pref_32 != (uint64_t)-1 &&
-        res_reserve.mem_pref_32 >= (1ULL << 32)) {
+        res_reserve.mem_pref_32 >= 4 * GiB) {
         error_setg(errp,
                    "PCI resource reserve cap: pref32-reserve  must be less than 4G");
         return -EINVAL;

From 51eae1e7e427ef9efc7c10c52b33575fe685add3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= <f4bug@amsat.org>
Date: Mon, 1 Jun 2020 16:29:27 +0200
Subject: [PATCH 33/58] hw/pci-host: Use the IEC binary prefix definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IEC binary prefixes ease code review: the unit is explicit.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20200601142930.29408-6-f4bug@amsat.org>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 hw/pci-host/i440fx.c    | 3 ++-
 hw/pci-host/q35.c       | 2 +-
 hw/pci-host/versatile.c | 5 +++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 0adbd77553..aefb416c8f 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -23,6 +23,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "qemu/range.h"
 #include "hw/i386/pc.h"
 #include "hw/pci/pci.h"
@@ -301,7 +302,7 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type,
     memory_region_set_enabled(&f->smram_region, true);
 
     /* smram, as seen by SMM CPUs */
-    memory_region_init(&f->smram, OBJECT(d), "smram", 1ull << 32);
+    memory_region_init(&f->smram, OBJECT(d), "smram", 4 * GiB);
     memory_region_set_enabled(&f->smram, true);
     memory_region_init_alias(&f->low_smram, OBJECT(d), "smram-low",
                              f->ram_memory, 0xa0000, 0x20000);
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 352aeecfa7..b788f17b2c 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -589,7 +589,7 @@ static void mch_realize(PCIDevice *d, Error **errp)
     memory_region_set_enabled(&mch->open_high_smram, false);
 
     /* smram, as seen by SMM CPUs */
-    memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32);
+    memory_region_init(&mch->smram, OBJECT(mch), "smram", 4 * GiB);
     memory_region_set_enabled(&mch->smram, true);
     memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low",
                              mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
diff --git a/hw/pci-host/versatile.c b/hw/pci-host/versatile.c
index cfb9a78ea6..8ddfb8772a 100644
--- a/hw/pci-host/versatile.c
+++ b/hw/pci-host/versatile.c
@@ -8,6 +8,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "hw/sysbus.h"
 #include "migration/vmstate.h"
 #include "hw/irq.h"
@@ -399,8 +400,8 @@ static void pci_vpb_realize(DeviceState *dev, Error **errp)
     pci_map_irq_fn mapfn;
     int i;
 
-    memory_region_init(&s->pci_io_space, OBJECT(s), "pci_io", 1ULL << 32);
-    memory_region_init(&s->pci_mem_space, OBJECT(s), "pci_mem", 1ULL << 32);
+    memory_region_init(&s->pci_io_space, OBJECT(s), "pci_io", 4 * GiB);
+    memory_region_init(&s->pci_mem_space, OBJECT(s), "pci_mem", 4 * GiB);
 
     pci_root_bus_new_inplace(&s->pci_bus, sizeof(s->pci_bus), dev, "pci",
                              &s->pci_mem_space, &s->pci_io_space,

From 271094474b65de1ad7aaf729938de3d9b9d0d36f Mon Sep 17 00:00:00 2001
From: Dima Stepanov <dimastep@yandex-team.ru>
Date: Thu, 28 May 2020 12:11:18 +0300
Subject: [PATCH 34/58] char-socket: return -1 in case of disconnect during
 tcp_chr_write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During testing of the vhost-user-blk reconnect functionality the qemu
SIGSEGV was triggered:
 start qemu as:
 x86_64-softmmu/qemu-system-x86_64 -m 1024M -M q35 \
   -object memory-backend-file,id=ram-node0,size=1024M,mem-path=/dev/shm/qemu,share=on \
   -numa node,cpus=0,memdev=ram-node0 \
   -chardev socket,id=chardev0,path=./vhost.sock,noserver,reconnect=1 \
   -device vhost-user-blk-pci,chardev=chardev0,num-queues=4 --enable-kvm
 start vhost-user-blk daemon:
 ./vhost-user-blk -s ./vhost.sock -b test-img.raw

If vhost-user-blk will be killed during the vhost initialization
process, for instance after getting VHOST_SET_VRING_CALL command, then
QEMU will fail with the following backtrace:

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x00005555559272bb in vhost_user_read (dev=0x7fffef2d53e0, msg=0x7fffffffd5b0)
    at ./hw/virtio/vhost-user.c:260
260         CharBackend *chr = u->user->chr;

 #0  0x00005555559272bb in vhost_user_read (dev=0x7fffef2d53e0, msg=0x7fffffffd5b0)
    at ./hw/virtio/vhost-user.c:260
 #1  0x000055555592acb8 in vhost_user_get_config (dev=0x7fffef2d53e0, config=0x7fffef2d5394 "", config_len=60)
    at ./hw/virtio/vhost-user.c:1645
 #2  0x0000555555925525 in vhost_dev_get_config (hdev=0x7fffef2d53e0, config=0x7fffef2d5394 "", config_len=60)
    at ./hw/virtio/vhost.c:1490
 #3  0x00005555558cc46b in vhost_user_blk_device_realize (dev=0x7fffef2d51a0, errp=0x7fffffffd8f0)
    at ./hw/block/vhost-user-blk.c:429
 #4  0x0000555555920090 in virtio_device_realize (dev=0x7fffef2d51a0, errp=0x7fffffffd948)
    at ./hw/virtio/virtio.c:3615
 #5  0x0000555555a9779c in device_set_realized (obj=0x7fffef2d51a0, value=true, errp=0x7fffffffdb88)
    at ./hw/core/qdev.c:891
 ...

The problem is that vhost_user_write doesn't get an error after
disconnect and try to call vhost_user_read(). The tcp_chr_write()
routine should return -1 in case of disconnect. Indicate the EIO error
if this routine is called in the disconnected state.

Signed-off-by: Dima Stepanov <dimastep@yandex-team.ru>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <aeb7806bfc945faadf09f64dcfa30f59de3ac053.1590396396.git.dimastep@yandex-team.ru>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 chardev/char-socket.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index db253d4024..18e762643b 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -175,15 +175,16 @@ static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
 
         if (ret < 0 && errno != EAGAIN) {
             if (tcp_chr_read_poll(chr) <= 0) {
+                /* Perform disconnect and return error. */
                 tcp_chr_disconnect_locked(chr);
-                return len;
             } /* else let the read handler finish it properly */
         }
 
         return ret;
     } else {
-        /* XXX: indicate an error ? */
-        return len;
+        /* Indicate an error. */
+        errno = EIO;
+        return -1;
     }
 }
 

From 4bcad76f4c390f798198a9a93fbaa1a20d06b73e Mon Sep 17 00:00:00 2001
From: Dima Stepanov <dimastep@yandex-team.ru>
Date: Thu, 28 May 2020 12:11:19 +0300
Subject: [PATCH 35/58] vhost-user-blk: delay vhost_user_blk_disconnect

A socket write during vhost-user communication may trigger a disconnect
event, calling vhost_user_blk_disconnect() and clearing all the
vhost_dev structures holding data that vhost-user functions expect to
remain valid to roll back initialization correctly. Delay the cleanup to
keep vhost_dev structure valid.
There are two possible states to handle:
1. RUN_STATE_PRELAUNCH: skip bh oneshot call and perform disconnect in
the caller routine.
2. RUN_STATE_RUNNING: delay by using bh

BH changes are based on the similar changes for the vhost-user-net
device:
  commit e7c83a885f865128ae3cf1946f8cb538b63cbfba
  "vhost-user: delay vhost_user_stop"

Signed-off-by: Dima Stepanov <dimastep@yandex-team.ru>
Message-Id: <69b73b94dcd066065595266c852810e0863a0895.1590396396.git.dimastep@yandex-team.ru>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Li Feng <fengli@smartx.com>
Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
---
 hw/block/vhost-user-blk.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 9d8c0b3909..76838e76d3 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -349,6 +349,19 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
     vhost_dev_cleanup(&s->dev);
 }
 
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event);
+
+static void vhost_user_blk_chr_closed_bh(void *opaque)
+{
+    DeviceState *dev = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+
+    vhost_user_blk_disconnect(dev);
+    qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
+            NULL, opaque, NULL, true);
+}
+
 static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
 {
     DeviceState *dev = opaque;
@@ -363,7 +376,30 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
         }
         break;
     case CHR_EVENT_CLOSED:
-        vhost_user_blk_disconnect(dev);
+        /*
+         * A close event may happen during a read/write, but vhost
+         * code assumes the vhost_dev remains setup, so delay the
+         * stop & clear. There are two possible paths to hit this
+         * disconnect event:
+         * 1. When VM is in the RUN_STATE_PRELAUNCH state. The
+         * vhost_user_blk_device_realize() is a caller.
+         * 2. In tha main loop phase after VM start.
+         *
+         * For p2 the disconnect event will be delayed. We can't
+         * do the same for p1, because we are not running the loop
+         * at this moment. So just skip this step and perform
+         * disconnect in the caller function.
+         *
+         * TODO: maybe it is a good idea to make the same fix
+         * for other vhost-user devices.
+         */
+        if (runstate_is_running()) {
+            AioContext *ctx = qemu_get_current_aio_context();
+
+            qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL,
+                    NULL, NULL, false);
+            aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
+        }
         break;
     case CHR_EVENT_BREAK:
     case CHR_EVENT_MUX_IN:

From ece99091c2d0aeb23734289a50ef2ff4e0a08929 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:26 +0000
Subject: [PATCH 36/58] Add helper to populate vhost-user message regions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When setting vhost-user memory tables, memory region descriptors must be
copied from the vhost_dev struct to the vhost-user message. To avoid
duplicating code in setting the memory tables, we should use a helper to
populate this field. This change adds this helper.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-2-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 hw/virtio/vhost-user.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index ec21e8fbe8..2e0552dd74 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -407,6 +407,15 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
     return 0;
 }
 
+static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
+                                       struct vhost_memory_region *src)
+{
+    assert(src != NULL && dst != NULL);
+    dst->userspace_addr = src->userspace_addr;
+    dst->memory_size = src->memory_size;
+    dst->guest_phys_addr = src->guest_phys_addr;
+}
+
 static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
                                              struct vhost_dev *dev,
                                              VhostUserMsg *msg,
@@ -417,6 +426,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
     ram_addr_t offset;
     MemoryRegion *mr;
     struct vhost_memory_region *reg;
+    VhostUserMemoryRegion region_buffer;
 
     msg->hdr.request = VHOST_USER_SET_MEM_TABLE;
 
@@ -441,12 +451,8 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
                 error_report("Failed preparing vhost-user memory table msg");
                 return -1;
             }
-            msg->payload.memory.regions[*fd_num].userspace_addr =
-                reg->userspace_addr;
-            msg->payload.memory.regions[*fd_num].memory_size =
-                reg->memory_size;
-            msg->payload.memory.regions[*fd_num].guest_phys_addr =
-                reg->guest_phys_addr;
+            vhost_user_fill_msg_region(&region_buffer, reg);
+            msg->payload.memory.regions[*fd_num] = region_buffer;
             msg->payload.memory.regions[*fd_num].mmap_offset = offset;
             fds[(*fd_num)++] = fd;
         } else if (track_ramblocks) {

From 23374a84c5f08e20ec2506a6322330d51f9134c5 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:29 +0000
Subject: [PATCH 37/58] Add vhost-user helper to get MemoryRegion data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When setting the memory tables, qemu uses a memory region's userspace
address to look up the region's MemoryRegion struct. Among other things,
the MemoryRegion contains the region's offset and associated file
descriptor, all of which need to be sent to the backend.

With VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, this logic will be
needed in multiple places, so before feature support is added it
should be moved to a helper function.

This helper is also used to simplify the vhost_user_can_merge()
function.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-3-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 hw/virtio/vhost-user.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 2e0552dd74..442b0d650a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -407,6 +407,18 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
     return 0;
 }
 
+static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset,
+                                            int *fd)
+{
+    MemoryRegion *mr;
+
+    assert((uintptr_t)addr == addr);
+    mr = memory_region_from_host((void *)(uintptr_t)addr, offset);
+    *fd = memory_region_get_fd(mr);
+
+    return mr;
+}
+
 static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
                                        struct vhost_memory_region *src)
 {
@@ -433,10 +445,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
     for (i = 0; i < dev->mem->nregions; ++i) {
         reg = dev->mem->regions + i;
 
-        assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
-        mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr,
-                                     &offset);
-        fd = memory_region_get_fd(mr);
+        mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
         if (fd > 0) {
             if (track_ramblocks) {
                 assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS);
@@ -1551,13 +1560,9 @@ static bool vhost_user_can_merge(struct vhost_dev *dev,
 {
     ram_addr_t offset;
     int mfd, rfd;
-    MemoryRegion *mr;
 
-    mr = memory_region_from_host((void *)(uintptr_t)start1, &offset);
-    mfd = memory_region_get_fd(mr);
-
-    mr = memory_region_from_host((void *)(uintptr_t)start2, &offset);
-    rfd = memory_region_get_fd(mr);
+    (void)vhost_user_get_mr_data(start1, &offset, &mfd);
+    (void)vhost_user_get_mr_data(start2, &offset, &rfd);
 
     return mfd == rfd;
 }

From 6b0eff1a4ea47c835a7d8bee88c05c47ada37495 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:32 +0000
Subject: [PATCH 38/58] Add VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change introduces a new feature to the vhost-user protocol allowing
a backend device to specify the maximum number of ram slots it supports.

At this point, the value returned by the backend will be capped at the
maximum number of ram slots which can be supported by vhost-user, which
is currently set to 8 because of underlying protocol limitations.

The returned value will be stored inside the VhostUserState struct so
that on device reconnect we can verify that the ram slot limitation
has not decreased since the last time the device connected.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com>
Message-Id: <1588533678-23450-4-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 docs/interop/vhost-user.rst    | 16 +++++++++++
 hw/virtio/vhost-user.c         | 49 ++++++++++++++++++++++++++++++++--
 include/hw/virtio/vhost-user.h |  1 +
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 3b1b6602c7..b3cf5c3cb5 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -815,6 +815,7 @@ Protocol features
   #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD       12
   #define VHOST_USER_PROTOCOL_F_RESET_DEVICE         13
   #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
+  #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS  15
 
 Master message types
 --------------------
@@ -1263,6 +1264,21 @@ Master message types
 
   The state.num field is currently reserved and must be set to 0.
 
+``VHOST_USER_GET_MAX_MEM_SLOTS``
+  :id: 36
+  :equivalent ioctl: N/A
+  :slave payload: u64
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by master to the slave. The slave should return the message with a
+  u64 payload containing the maximum number of memory slots for
+  QEMU to expose to the guest. At this point, the value returned
+  by the backend will be capped at the maximum number of ram slots
+  which can be supported by vhost-user. Currently that limit is set
+  at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol
+  limitations.
+
 Slave message types
 -------------------
 
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 442b0d650a..754ad885cf 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -59,6 +59,8 @@ enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
     VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
     VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
+    /* Feature 14 reserved for VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. */
+    VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
     VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -100,6 +102,8 @@ typedef enum VhostUserRequest {
     VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_GPU_SET_SOCKET = 33,
     VHOST_USER_RESET_DEVICE = 34,
+    /* Message number 35 reserved for VHOST_USER_VRING_KICK. */
+    VHOST_USER_GET_MAX_MEM_SLOTS = 36,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -895,6 +899,23 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
     return 0;
 }
 
+static int vhost_user_get_max_memslots(struct vhost_dev *dev,
+                                       uint64_t *max_memslots)
+{
+    uint64_t backend_max_memslots;
+    int err;
+
+    err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS,
+                             &backend_max_memslots);
+    if (err < 0) {
+        return err;
+    }
+
+    *max_memslots = backend_max_memslots;
+
+    return 0;
+}
+
 static int vhost_user_reset_device(struct vhost_dev *dev)
 {
     VhostUserMsg msg = {
@@ -1392,7 +1413,7 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
 
 static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
 {
-    uint64_t features, protocol_features;
+    uint64_t features, protocol_features, ram_slots;
     struct vhost_user *u;
     int err;
 
@@ -1454,6 +1475,27 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
                          "slave-req protocol features.");
             return -1;
         }
+
+        /* get max memory regions if backend supports configurable RAM slots */
+        if (!virtio_has_feature(dev->protocol_features,
+                                VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
+            u->user->memory_slots = VHOST_MEMORY_MAX_NREGIONS;
+        } else {
+            err = vhost_user_get_max_memslots(dev, &ram_slots);
+            if (err < 0) {
+                return err;
+            }
+
+            if (ram_slots < u->user->memory_slots) {
+                error_report("The backend specified a max ram slots limit "
+                             "of %" PRIu64", when the prior validated limit was %d. "
+                             "This limit should never decrease.", ram_slots,
+                             u->user->memory_slots);
+                return -1;
+            }
+
+            u->user->memory_slots = MIN(ram_slots, VHOST_MEMORY_MAX_NREGIONS);
+        }
     }
 
     if (dev->migration_blocker == NULL &&
@@ -1519,7 +1561,9 @@ static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
 
 static int vhost_user_memslots_limit(struct vhost_dev *dev)
 {
-    return VHOST_MEMORY_MAX_NREGIONS;
+    struct vhost_user *u = dev->opaque;
+
+    return u->user->memory_slots;
 }
 
 static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
@@ -1904,6 +1948,7 @@ bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
         return false;
     }
     user->chr = chr;
+    user->memory_slots = 0;
     return true;
 }
 
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index 811e325f42..a9abca3288 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -20,6 +20,7 @@ typedef struct VhostUserHostNotifier {
 typedef struct VhostUserState {
     CharBackend *chr;
     VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
+    int memory_slots;
 } VhostUserState;
 
 bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp);

From f1aeb14b0809e313c74244d838645ed25e85ea63 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:35 +0000
Subject: [PATCH 39/58] Transmit vhost-user memory regions individually
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature has been negotiated, Qemu no longer sends the backend
all the memory regions in a single message. Rather, when the memory
tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and
VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map
and/or unmap instead of sending send all the regions in one fixed size
VHOST_USER_SET_MEM_TABLE message.

The vhost_user struct maintains a shadow state of the VM’s memory
regions. When the memory tables are modified, the
vhost_user_set_mem_table() function compares the new device memory state
to the shadow state and only sends regions which need to be unmapped or
mapped in. The regions which must be unmapped are sent first, followed
by the new regions to be mapped in. After all the messages have been
sent, the shadow state is set to the current virtual device state.

Existing backends which do not support
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com>
Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com>
Suggested-by: Mike Cui <cui@nutanix.com>
Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 docs/interop/vhost-user.rst |  33 ++-
 hw/virtio/vhost-user.c      | 516 ++++++++++++++++++++++++++++++------
 2 files changed, 472 insertions(+), 77 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index b3cf5c3cb5..037eefab0e 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1276,8 +1276,37 @@ Master message types
   QEMU to expose to the guest. At this point, the value returned
   by the backend will be capped at the maximum number of ram slots
   which can be supported by vhost-user. Currently that limit is set
-  at VHOST_USER_MAX_RAM_SLOTS = 8 because of underlying protocol
-  limitations.
+  at VHOST_USER_MAX_RAM_SLOTS = 8.
+
+``VHOST_USER_ADD_MEM_REG``
+  :id: 37
+  :equivalent ioctl: N/A
+  :slave payload: memory region
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by the master to the slave. The message payload contains a memory
+  region descriptor struct, describing a region of guest memory which
+  the slave device must map in. When the
+  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
+  been successfully negotiated, along with the
+  ``VHOST_USER_REM_MEM_REG`` message, this message is used to set and
+  update the memory tables of the slave device.
+
+``VHOST_USER_REM_MEM_REG``
+  :id: 38
+  :equivalent ioctl: N/A
+  :slave payload: memory region
+
+  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
+  feature has been successfully negotiated, this message is submitted
+  by the master to the slave. The message payload contains a memory
+  region descriptor struct, describing a region of guest memory which
+  the slave device must unmap. When the
+  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
+  been successfully negotiated, along with the
+  ``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and
+  update the memory tables of the slave device.
 
 Slave message types
 -------------------
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 754ad885cf..3640f017a2 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -104,6 +104,8 @@ typedef enum VhostUserRequest {
     VHOST_USER_RESET_DEVICE = 34,
     /* Message number 35 reserved for VHOST_USER_VRING_KICK. */
     VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+    VHOST_USER_ADD_MEM_REG = 37,
+    VHOST_USER_REM_MEM_REG = 38,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -128,6 +130,11 @@ typedef struct VhostUserMemory {
     VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserMemRegMsg {
+    uint32_t padding;
+    VhostUserMemoryRegion region;
+} VhostUserMemRegMsg;
+
 typedef struct VhostUserLog {
     uint64_t mmap_size;
     uint64_t mmap_offset;
@@ -186,6 +193,7 @@ typedef union {
         struct vhost_vring_state state;
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
+        VhostUserMemRegMsg mem_reg;
         VhostUserLog log;
         struct vhost_iotlb_msg iotlb;
         VhostUserConfig config;
@@ -226,6 +234,16 @@ struct vhost_user {
 
     /* True once we've entered postcopy_listen */
     bool               postcopy_listen;
+
+    /* Our current regions */
+    int num_shadow_regions;
+    struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS];
+};
+
+struct scrub_regions {
+    struct vhost_memory_region *region;
+    int reg_idx;
+    int fd_idx;
 };
 
 static bool ioeventfd_enabled(void)
@@ -489,8 +507,332 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
     return 1;
 }
 
+static inline bool reg_equal(struct vhost_memory_region *shadow_reg,
+                             struct vhost_memory_region *vdev_reg)
+{
+    return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr &&
+        shadow_reg->userspace_addr == vdev_reg->userspace_addr &&
+        shadow_reg->memory_size == vdev_reg->memory_size;
+}
+
+static void scrub_shadow_regions(struct vhost_dev *dev,
+                                 struct scrub_regions *add_reg,
+                                 int *nr_add_reg,
+                                 struct scrub_regions *rem_reg,
+                                 int *nr_rem_reg, uint64_t *shadow_pcb,
+                                 bool track_ramblocks)
+{
+    struct vhost_user *u = dev->opaque;
+    bool found[VHOST_MEMORY_MAX_NREGIONS] = {};
+    struct vhost_memory_region *reg, *shadow_reg;
+    int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0;
+    ram_addr_t offset;
+    MemoryRegion *mr;
+    bool matching;
+
+    /*
+     * Find memory regions present in our shadow state which are not in
+     * the device's current memory state.
+     *
+     * Mark regions in both the shadow and device state as "found".
+     */
+    for (i = 0; i < u->num_shadow_regions; i++) {
+        shadow_reg = &u->shadow_regions[i];
+        matching = false;
+
+        for (j = 0; j < dev->mem->nregions; j++) {
+            reg = &dev->mem->regions[j];
+
+            mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+
+            if (reg_equal(shadow_reg, reg)) {
+                matching = true;
+                found[j] = true;
+                if (track_ramblocks) {
+                    /*
+                     * Reset postcopy client bases, region_rb, and
+                     * region_rb_offset in case regions are removed.
+                     */
+                    if (fd > 0) {
+                        u->region_rb_offset[j] = offset;
+                        u->region_rb[j] = mr->ram_block;
+                        shadow_pcb[j] = u->postcopy_client_bases[i];
+                    } else {
+                        u->region_rb_offset[j] = 0;
+                        u->region_rb[j] = NULL;
+                    }
+                }
+                break;
+            }
+        }
+
+        /*
+         * If the region was not found in the current device memory state
+         * create an entry for it in the removed list.
+         */
+        if (!matching) {
+            rem_reg[rm_idx].region = shadow_reg;
+            rem_reg[rm_idx++].reg_idx = i;
+        }
+    }
+
+    /*
+     * For regions not marked "found", create entries in the added list.
+     *
+     * Note their indexes in the device memory state and the indexes of their
+     * file descriptors.
+     */
+    for (i = 0; i < dev->mem->nregions; i++) {
+        reg = &dev->mem->regions[i];
+        mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+        if (fd > 0) {
+            ++fd_num;
+        }
+
+        /*
+         * If the region was in both the shadow and device state we don't
+         * need to send a VHOST_USER_ADD_MEM_REG message for it.
+         */
+        if (found[i]) {
+            continue;
+        }
+
+        add_reg[add_idx].region = reg;
+        add_reg[add_idx].reg_idx = i;
+        add_reg[add_idx++].fd_idx = fd_num;
+    }
+    *nr_rem_reg = rm_idx;
+    *nr_add_reg = add_idx;
+
+    return;
+}
+
+static int send_remove_regions(struct vhost_dev *dev,
+                               struct scrub_regions *remove_reg,
+                               int nr_rem_reg, VhostUserMsg *msg,
+                               bool reply_supported)
+{
+    struct vhost_user *u = dev->opaque;
+    struct vhost_memory_region *shadow_reg;
+    int i, fd, shadow_reg_idx, ret;
+    ram_addr_t offset;
+    VhostUserMemoryRegion region_buffer;
+
+    /*
+     * The regions in remove_reg appear in the same order they do in the
+     * shadow table. Therefore we can minimize memory copies by iterating
+     * through remove_reg backwards.
+     */
+    for (i = nr_rem_reg - 1; i >= 0; i--) {
+        shadow_reg = remove_reg[i].region;
+        shadow_reg_idx = remove_reg[i].reg_idx;
+
+        vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd);
+
+        if (fd > 0) {
+            msg->hdr.request = VHOST_USER_REM_MEM_REG;
+            vhost_user_fill_msg_region(&region_buffer, shadow_reg);
+            msg->payload.mem_reg.region = region_buffer;
+
+            if (vhost_user_write(dev, msg, &fd, 1) < 0) {
+                return -1;
+            }
+
+            if (reply_supported) {
+                ret = process_message_reply(dev, msg);
+                if (ret) {
+                    return ret;
+                }
+            }
+        }
+
+        /*
+         * At this point we know the backend has unmapped the region. It is now
+         * safe to remove it from the shadow table.
+         */
+        memmove(&u->shadow_regions[shadow_reg_idx],
+                &u->shadow_regions[shadow_reg_idx + 1],
+                sizeof(struct vhost_memory_region) *
+                (u->num_shadow_regions - shadow_reg_idx));
+        u->num_shadow_regions--;
+    }
+
+    return 0;
+}
+
+static int send_add_regions(struct vhost_dev *dev,
+                            struct scrub_regions *add_reg, int nr_add_reg,
+                            VhostUserMsg *msg, uint64_t *shadow_pcb,
+                            bool reply_supported, bool track_ramblocks)
+{
+    struct vhost_user *u = dev->opaque;
+    int i, fd, ret, reg_idx, reg_fd_idx;
+    struct vhost_memory_region *reg;
+    MemoryRegion *mr;
+    ram_addr_t offset;
+    VhostUserMsg msg_reply;
+    VhostUserMemoryRegion region_buffer;
+
+    for (i = 0; i < nr_add_reg; i++) {
+        reg = add_reg[i].region;
+        reg_idx = add_reg[i].reg_idx;
+        reg_fd_idx = add_reg[i].fd_idx;
+
+        mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+
+        if (fd > 0) {
+            if (track_ramblocks) {
+                trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name,
+                                                      reg->memory_size,
+                                                      reg->guest_phys_addr,
+                                                      reg->userspace_addr,
+                                                      offset);
+                u->region_rb_offset[reg_idx] = offset;
+                u->region_rb[reg_idx] = mr->ram_block;
+            }
+            msg->hdr.request = VHOST_USER_ADD_MEM_REG;
+            vhost_user_fill_msg_region(&region_buffer, reg);
+            msg->payload.mem_reg.region = region_buffer;
+            msg->payload.mem_reg.region.mmap_offset = offset;
+
+            if (vhost_user_write(dev, msg, &fd, 1) < 0) {
+                return -1;
+            }
+
+            if (track_ramblocks) {
+                uint64_t reply_gpa;
+
+                if (vhost_user_read(dev, &msg_reply) < 0) {
+                    return -1;
+                }
+
+                reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr;
+
+                if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) {
+                    error_report("%s: Received unexpected msg type."
+                                 "Expected %d received %d", __func__,
+                                 VHOST_USER_ADD_MEM_REG,
+                                 msg_reply.hdr.request);
+                    return -1;
+                }
+
+                /*
+                 * We're using the same structure, just reusing one of the
+                 * fields, so it should be the same size.
+                 */
+                if (msg_reply.hdr.size != msg->hdr.size) {
+                    error_report("%s: Unexpected size for postcopy reply "
+                                 "%d vs %d", __func__, msg_reply.hdr.size,
+                                 msg->hdr.size);
+                    return -1;
+                }
+
+                /* Get the postcopy client base from the backend's reply. */
+                if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) {
+                    shadow_pcb[reg_idx] =
+                        msg_reply.payload.mem_reg.region.userspace_addr;
+                    trace_vhost_user_set_mem_table_postcopy(
+                        msg_reply.payload.mem_reg.region.userspace_addr,
+                        msg->payload.mem_reg.region.userspace_addr,
+                        reg_fd_idx, reg_idx);
+                } else {
+                    error_report("%s: invalid postcopy reply for region. "
+                                 "Got guest physical address %" PRIX64 ", expected "
+                                 "%" PRIX64, __func__, reply_gpa,
+                                 dev->mem->regions[reg_idx].guest_phys_addr);
+                    return -1;
+                }
+            } else if (reply_supported) {
+                ret = process_message_reply(dev, msg);
+                if (ret) {
+                    return ret;
+                }
+            }
+        } else if (track_ramblocks) {
+            u->region_rb_offset[reg_idx] = 0;
+            u->region_rb[reg_idx] = NULL;
+        }
+
+        /*
+         * At this point, we know the backend has mapped in the new
+         * region, if the region has a valid file descriptor.
+         *
+         * The region should now be added to the shadow table.
+         */
+        u->shadow_regions[u->num_shadow_regions].guest_phys_addr =
+            reg->guest_phys_addr;
+        u->shadow_regions[u->num_shadow_regions].userspace_addr =
+            reg->userspace_addr;
+        u->shadow_regions[u->num_shadow_regions].memory_size =
+            reg->memory_size;
+        u->num_shadow_regions++;
+    }
+
+    return 0;
+}
+
+static int vhost_user_add_remove_regions(struct vhost_dev *dev,
+                                         VhostUserMsg *msg,
+                                         bool reply_supported,
+                                         bool track_ramblocks)
+{
+    struct vhost_user *u = dev->opaque;
+    struct scrub_regions add_reg[VHOST_MEMORY_MAX_NREGIONS];
+    struct scrub_regions rem_reg[VHOST_MEMORY_MAX_NREGIONS];
+    uint64_t shadow_pcb[VHOST_MEMORY_MAX_NREGIONS] = {};
+    int nr_add_reg, nr_rem_reg;
+
+    msg->hdr.size = sizeof(msg->payload.mem_reg.padding) +
+        sizeof(VhostUserMemoryRegion);
+
+    /* Find the regions which need to be removed or added. */
+    scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg,
+                         shadow_pcb, track_ramblocks);
+
+    if (nr_rem_reg && send_remove_regions(dev, rem_reg, nr_rem_reg, msg,
+                reply_supported) < 0)
+    {
+        goto err;
+    }
+
+    if (nr_add_reg && send_add_regions(dev, add_reg, nr_add_reg, msg,
+                shadow_pcb, reply_supported, track_ramblocks) < 0)
+    {
+        goto err;
+    }
+
+    if (track_ramblocks) {
+        memcpy(u->postcopy_client_bases, shadow_pcb,
+               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+        /*
+         * Now we've registered this with the postcopy code, we ack to the
+         * client, because now we're in the position to be able to deal with
+         * any faults it generates.
+         */
+        /* TODO: Use this for failure cases as well with a bad value. */
+        msg->hdr.size = sizeof(msg->payload.u64);
+        msg->payload.u64 = 0; /* OK */
+
+        if (vhost_user_write(dev, msg, NULL, 0) < 0) {
+            return -1;
+        }
+    }
+
+    return 0;
+
+err:
+    if (track_ramblocks) {
+        memcpy(u->postcopy_client_bases, shadow_pcb,
+               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+    }
+
+    return -1;
+}
+
 static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
-                                             struct vhost_memory *mem)
+                                             struct vhost_memory *mem,
+                                             bool reply_supported,
+                                             bool config_mem_slots)
 {
     struct vhost_user *u = dev->opaque;
     int fds[VHOST_MEMORY_MAX_NREGIONS];
@@ -513,71 +855,84 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
         u->region_rb_len = dev->mem->nregions;
     }
 
-    if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+    if (config_mem_slots) {
+        if (vhost_user_add_remove_regions(dev, &msg, reply_supported,
                                           true) < 0) {
-        return -1;
-    }
-
-    if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
-        return -1;
-    }
-
-    if (vhost_user_read(dev, &msg_reply) < 0) {
-        return -1;
-    }
-
-    if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
-        error_report("%s: Received unexpected msg type."
-                     "Expected %d received %d", __func__,
-                     VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
-        return -1;
-    }
-    /* We're using the same structure, just reusing one of the
-     * fields, so it should be the same size.
-     */
-    if (msg_reply.hdr.size != msg.hdr.size) {
-        error_report("%s: Unexpected size for postcopy reply "
-                     "%d vs %d", __func__, msg_reply.hdr.size, msg.hdr.size);
-        return -1;
-    }
-
-    memset(u->postcopy_client_bases, 0,
-           sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
-
-    /* They're in the same order as the regions that were sent
-     * but some of the regions were skipped (above) if they
-     * didn't have fd's
-    */
-    for (msg_i = 0, region_i = 0;
-         region_i < dev->mem->nregions;
-        region_i++) {
-        if (msg_i < fd_num &&
-            msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
-            dev->mem->regions[region_i].guest_phys_addr) {
-            u->postcopy_client_bases[region_i] =
-                msg_reply.payload.memory.regions[msg_i].userspace_addr;
-            trace_vhost_user_set_mem_table_postcopy(
-                msg_reply.payload.memory.regions[msg_i].userspace_addr,
-                msg.payload.memory.regions[msg_i].userspace_addr,
-                msg_i, region_i);
-            msg_i++;
+            return -1;
+        }
+    } else {
+        if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+                                              true) < 0) {
+            return -1;
+        }
+
+        if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
+            return -1;
+        }
+
+        if (vhost_user_read(dev, &msg_reply) < 0) {
+            return -1;
+        }
+
+        if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
+            error_report("%s: Received unexpected msg type."
+                         "Expected %d received %d", __func__,
+                         VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
+            return -1;
+        }
+
+        /*
+         * We're using the same structure, just reusing one of the
+         * fields, so it should be the same size.
+         */
+        if (msg_reply.hdr.size != msg.hdr.size) {
+            error_report("%s: Unexpected size for postcopy reply "
+                         "%d vs %d", __func__, msg_reply.hdr.size,
+                         msg.hdr.size);
+            return -1;
+        }
+
+        memset(u->postcopy_client_bases, 0,
+               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+
+        /*
+         * They're in the same order as the regions that were sent
+         * but some of the regions were skipped (above) if they
+         * didn't have fd's
+         */
+        for (msg_i = 0, region_i = 0;
+             region_i < dev->mem->nregions;
+             region_i++) {
+            if (msg_i < fd_num &&
+                msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
+                dev->mem->regions[region_i].guest_phys_addr) {
+                u->postcopy_client_bases[region_i] =
+                    msg_reply.payload.memory.regions[msg_i].userspace_addr;
+                trace_vhost_user_set_mem_table_postcopy(
+                    msg_reply.payload.memory.regions[msg_i].userspace_addr,
+                    msg.payload.memory.regions[msg_i].userspace_addr,
+                    msg_i, region_i);
+                msg_i++;
+            }
+        }
+        if (msg_i != fd_num) {
+            error_report("%s: postcopy reply not fully consumed "
+                         "%d vs %zd",
+                         __func__, msg_i, fd_num);
+            return -1;
+        }
+
+        /*
+         * Now we've registered this with the postcopy code, we ack to the
+         * client, because now we're in the position to be able to deal
+         * with any faults it generates.
+         */
+        /* TODO: Use this for failure cases as well with a bad value. */
+        msg.hdr.size = sizeof(msg.payload.u64);
+        msg.payload.u64 = 0; /* OK */
+        if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+            return -1;
         }
-    }
-    if (msg_i != fd_num) {
-        error_report("%s: postcopy reply not fully consumed "
-                     "%d vs %zd",
-                     __func__, msg_i, fd_num);
-        return -1;
-    }
-    /* Now we've registered this with the postcopy code, we ack to the client,
-     * because now we're in the position to be able to deal with any faults
-     * it generates.
-     */
-    /* TODO: Use this for failure cases as well with a bad value */
-    msg.hdr.size = sizeof(msg.payload.u64);
-    msg.payload.u64 = 0; /* OK */
-    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
-        return -1;
     }
 
     return 0;
@@ -592,12 +947,17 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
     bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
     bool reply_supported = virtio_has_feature(dev->protocol_features,
                                               VHOST_USER_PROTOCOL_F_REPLY_ACK);
+    bool config_mem_slots =
+        virtio_has_feature(dev->protocol_features,
+                           VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS);
 
     if (do_postcopy) {
-        /* Postcopy has enough differences that it's best done in it's own
+        /*
+         * Postcopy has enough differences that it's best done in it's own
          * version
          */
-        return vhost_user_set_mem_table_postcopy(dev, mem);
+        return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported,
+                                                 config_mem_slots);
     }
 
     VhostUserMsg msg = {
@@ -608,17 +968,23 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
         msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
     }
 
-    if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+    if (config_mem_slots) {
+        if (vhost_user_add_remove_regions(dev, &msg, reply_supported,
                                           false) < 0) {
-        return -1;
-    }
+            return -1;
+        }
+    } else {
+        if (vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+                                              false) < 0) {
+            return -1;
+        }
+        if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
+            return -1;
+        }
 
-    if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
-        return -1;
-    }
-
-    if (reply_supported) {
-        return process_message_reply(dev, &msg);
+        if (reply_supported) {
+            return process_message_reply(dev, &msg);
+        }
     }
 
     return 0;

From 27598393a23215cfbf92ad550b9541675b0b8f2b Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:40 +0000
Subject: [PATCH 40/58] Lift max memory slots limit imposed by vhost-user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Historically, sending all memory regions to vhost-user backends in a
single message imposed a limitation on the number of times memory
could be hot-added to a VM with a vhost-user device. Now that backends
which support the VHOST_USER_PROTOCOL_F_CONFIGURE_SLOTS send memory
regions individually, we no longer need to impose this limitation on
devices which support this feature.

With this change, VMs with a vhost-user device which supports the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS can support a configurable
number of memory slots, up to the maximum allowed by the target
platform.

Existing backends which do not support
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com>
Suggested-by: Mike Cui <cui@nutanix.com>
Message-Id: <1588533678-23450-6-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 docs/interop/vhost-user.rst |  7 ++---
 hw/virtio/vhost-user.c      | 56 ++++++++++++++++++++++++-------------
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 037eefab0e..688b7c6900 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -1273,10 +1273,9 @@ Master message types
   feature has been successfully negotiated, this message is submitted
   by master to the slave. The slave should return the message with a
   u64 payload containing the maximum number of memory slots for
-  QEMU to expose to the guest. At this point, the value returned
-  by the backend will be capped at the maximum number of ram slots
-  which can be supported by vhost-user. Currently that limit is set
-  at VHOST_USER_MAX_RAM_SLOTS = 8.
+  QEMU to expose to the guest. The value returned by the backend
+  will be capped at the maximum number of ram slots which can be
+  supported by the target platform.
 
 ``VHOST_USER_ADD_MEM_REG``
   :id: 37
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 3640f017a2..4d6cd4e58a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -35,10 +35,28 @@
 #include <linux/userfaultfd.h>
 #endif
 
-#define VHOST_MEMORY_MAX_NREGIONS    8
+#define VHOST_MEMORY_BASELINE_NREGIONS    8
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 #define VHOST_USER_SLAVE_MAX_FDS     8
 
+/*
+ * Set maximum number of RAM slots supported to
+ * the maximum number supported by the target
+ * hardware plaform.
+ */
+#if defined(TARGET_X86) || defined(TARGET_X86_64) || \
+    defined(TARGET_ARM) || defined(TARGET_ARM_64)
+#include "hw/acpi/acpi.h"
+#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS
+
+#elif defined(TARGET_PPC) || defined(TARGET_PPC_64)
+#include "hw/ppc/spapr.h"
+#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
+
+#else
+#define VHOST_USER_MAX_RAM_SLOTS 512
+#endif
+
 /*
  * Maximum size of virtio device config space
  */
@@ -127,7 +145,7 @@ typedef struct VhostUserMemoryRegion {
 typedef struct VhostUserMemory {
     uint32_t nregions;
     uint32_t padding;
-    VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+    VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
 } VhostUserMemory;
 
 typedef struct VhostUserMemRegMsg {
@@ -222,7 +240,7 @@ struct vhost_user {
     int slave_fd;
     NotifierWithReturn postcopy_notifier;
     struct PostCopyFD  postcopy_fd;
-    uint64_t           postcopy_client_bases[VHOST_MEMORY_MAX_NREGIONS];
+    uint64_t           postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS];
     /* Length of the region_rb and region_rb_offset arrays */
     size_t             region_rb_len;
     /* RAMBlock associated with a given region */
@@ -237,7 +255,7 @@ struct vhost_user {
 
     /* Our current regions */
     int num_shadow_regions;
-    struct vhost_memory_region shadow_regions[VHOST_MEMORY_MAX_NREGIONS];
+    struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS];
 };
 
 struct scrub_regions {
@@ -392,7 +410,7 @@ int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd)
 static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
                                    struct vhost_log *log)
 {
-    int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fds[VHOST_USER_MAX_RAM_SLOTS];
     size_t fd_num = 0;
     bool shmfd = virtio_has_feature(dev->protocol_features,
                                     VHOST_USER_PROTOCOL_F_LOG_SHMFD);
@@ -470,7 +488,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
         mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
         if (fd > 0) {
             if (track_ramblocks) {
-                assert(*fd_num < VHOST_MEMORY_MAX_NREGIONS);
+                assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS);
                 trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name,
                                                       reg->memory_size,
                                                       reg->guest_phys_addr,
@@ -478,7 +496,7 @@ static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
                                                       offset);
                 u->region_rb_offset[i] = offset;
                 u->region_rb[i] = mr->ram_block;
-            } else if (*fd_num == VHOST_MEMORY_MAX_NREGIONS) {
+            } else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) {
                 error_report("Failed preparing vhost-user memory table msg");
                 return -1;
             }
@@ -523,7 +541,7 @@ static void scrub_shadow_regions(struct vhost_dev *dev,
                                  bool track_ramblocks)
 {
     struct vhost_user *u = dev->opaque;
-    bool found[VHOST_MEMORY_MAX_NREGIONS] = {};
+    bool found[VHOST_USER_MAX_RAM_SLOTS] = {};
     struct vhost_memory_region *reg, *shadow_reg;
     int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0;
     ram_addr_t offset;
@@ -777,9 +795,9 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev,
                                          bool track_ramblocks)
 {
     struct vhost_user *u = dev->opaque;
-    struct scrub_regions add_reg[VHOST_MEMORY_MAX_NREGIONS];
-    struct scrub_regions rem_reg[VHOST_MEMORY_MAX_NREGIONS];
-    uint64_t shadow_pcb[VHOST_MEMORY_MAX_NREGIONS] = {};
+    struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS];
+    struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS];
+    uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {};
     int nr_add_reg, nr_rem_reg;
 
     msg->hdr.size = sizeof(msg->payload.mem_reg.padding) +
@@ -803,7 +821,7 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev,
 
     if (track_ramblocks) {
         memcpy(u->postcopy_client_bases, shadow_pcb,
-               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+               sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
         /*
          * Now we've registered this with the postcopy code, we ack to the
          * client, because now we're in the position to be able to deal with
@@ -823,7 +841,7 @@ static int vhost_user_add_remove_regions(struct vhost_dev *dev,
 err:
     if (track_ramblocks) {
         memcpy(u->postcopy_client_bases, shadow_pcb,
-               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+               sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
     }
 
     return -1;
@@ -835,7 +853,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
                                              bool config_mem_slots)
 {
     struct vhost_user *u = dev->opaque;
-    int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fds[VHOST_MEMORY_BASELINE_NREGIONS];
     size_t fd_num = 0;
     VhostUserMsg msg_reply;
     int region_i, msg_i;
@@ -893,7 +911,7 @@ static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
         }
 
         memset(u->postcopy_client_bases, 0,
-               sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+               sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
 
         /*
          * They're in the same order as the regions that were sent
@@ -942,7 +960,7 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
                                     struct vhost_memory *mem)
 {
     struct vhost_user *u = dev->opaque;
-    int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fds[VHOST_MEMORY_BASELINE_NREGIONS];
     size_t fd_num = 0;
     bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
     bool reply_supported = virtio_has_feature(dev->protocol_features,
@@ -1149,7 +1167,7 @@ static int vhost_set_vring_file(struct vhost_dev *dev,
                                 VhostUserRequest request,
                                 struct vhost_vring_file *file)
 {
-    int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fds[VHOST_USER_MAX_RAM_SLOTS];
     size_t fd_num = 0;
     VhostUserMsg msg = {
         .hdr.request = request,
@@ -1845,7 +1863,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
         /* get max memory regions if backend supports configurable RAM slots */
         if (!virtio_has_feature(dev->protocol_features,
                                 VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
-            u->user->memory_slots = VHOST_MEMORY_MAX_NREGIONS;
+            u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS;
         } else {
             err = vhost_user_get_max_memslots(dev, &ram_slots);
             if (err < 0) {
@@ -1860,7 +1878,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
                 return -1;
             }
 
-            u->user->memory_slots = MIN(ram_slots, VHOST_MEMORY_MAX_NREGIONS);
+            u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS);
         }
     }
 

From 08fccf8f077397b47c740d4850cdeedc947f21ac Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:47 +0000
Subject: [PATCH 41/58] Refactor out libvhost-user fault generation logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In libvhost-user, the incoming postcopy migration path for setting the
backend's memory tables has become convolued. In particular, moving the
logic which starts generating faults, having received the final ACK from
qemu can be moved to a separate function. This simplifies the code
substantially.

This logic will also be needed by the postcopy path once the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is supported.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-7-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 147 ++++++++++++++------------
 1 file changed, 79 insertions(+), 68 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 3bca996c62..cccfa22209 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -583,6 +583,84 @@ map_ring(VuDev *dev, VuVirtq *vq)
     return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
 }
 
+static bool
+generate_faults(VuDev *dev) {
+    int i;
+    for (i = 0; i < dev->nregions; i++) {
+        VuDevRegion *dev_region = &dev->regions[i];
+        int ret;
+#ifdef UFFDIO_REGISTER
+        /*
+         * We should already have an open ufd. Mark each memory
+         * range as ufd.
+         * Discard any mapping we have here; note I can't use MADV_REMOVE
+         * or fallocate to make the hole since I don't want to lose
+         * data that's already arrived in the shared process.
+         * TODO: How to do hugepage
+         */
+        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
+                      dev_region->size + dev_region->mmap_offset,
+                      MADV_DONTNEED);
+        if (ret) {
+            fprintf(stderr,
+                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
+                    __func__, i, strerror(errno));
+        }
+        /*
+         * Turn off transparent hugepages so we dont get lose wakeups
+         * in neighbouring pages.
+         * TODO: Turn this backon later.
+         */
+        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
+                      dev_region->size + dev_region->mmap_offset,
+                      MADV_NOHUGEPAGE);
+        if (ret) {
+            /*
+             * Note: This can happen legally on kernels that are configured
+             * without madvise'able hugepages
+             */
+            fprintf(stderr,
+                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
+                    __func__, i, strerror(errno));
+        }
+        struct uffdio_register reg_struct;
+        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
+        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
+        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
+            vu_panic(dev, "%s: Failed to userfault region %d "
+                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
+                     __func__, i,
+                     dev_region->mmap_addr,
+                     dev_region->size, dev_region->mmap_offset,
+                     dev->postcopy_ufd, strerror(errno));
+            return false;
+        }
+        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
+            vu_panic(dev, "%s Region (%d) doesn't support COPY",
+                     __func__, i);
+            return false;
+        }
+        DPRINT("%s: region %d: Registered userfault for %"
+               PRIx64 " + %" PRIx64 "\n", __func__, i,
+               (uint64_t)reg_struct.range.start,
+               (uint64_t)reg_struct.range.len);
+        /* Now it's registered we can let the client at it */
+        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
+                     dev_region->size + dev_region->mmap_offset,
+                     PROT_READ | PROT_WRITE)) {
+            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
+                     i, strerror(errno));
+            return false;
+        }
+        /* TODO: Stash 'zero' support flags somewhere */
+#endif
+    }
+
+    return true;
+}
+
 static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -655,74 +733,7 @@ vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
     }
 
     /* OK, now we can go and register the memory and generate faults */
-    for (i = 0; i < dev->nregions; i++) {
-        VuDevRegion *dev_region = &dev->regions[i];
-        int ret;
-#ifdef UFFDIO_REGISTER
-        /* We should already have an open ufd. Mark each memory
-         * range as ufd.
-         * Discard any mapping we have here; note I can't use MADV_REMOVE
-         * or fallocate to make the hole since I don't want to lose
-         * data that's already arrived in the shared process.
-         * TODO: How to do hugepage
-         */
-        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
-                      dev_region->size + dev_region->mmap_offset,
-                      MADV_DONTNEED);
-        if (ret) {
-            fprintf(stderr,
-                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
-                    __func__, i, strerror(errno));
-        }
-        /* Turn off transparent hugepages so we dont get lose wakeups
-         * in neighbouring pages.
-         * TODO: Turn this backon later.
-         */
-        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
-                      dev_region->size + dev_region->mmap_offset,
-                      MADV_NOHUGEPAGE);
-        if (ret) {
-            /* Note: This can happen legally on kernels that are configured
-             * without madvise'able hugepages
-             */
-            fprintf(stderr,
-                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
-                    __func__, i, strerror(errno));
-        }
-        struct uffdio_register reg_struct;
-        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
-        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
-        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
-            vu_panic(dev, "%s: Failed to userfault region %d "
-                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
-                     __func__, i,
-                     dev_region->mmap_addr,
-                     dev_region->size, dev_region->mmap_offset,
-                     dev->postcopy_ufd, strerror(errno));
-            return false;
-        }
-        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
-            vu_panic(dev, "%s Region (%d) doesn't support COPY",
-                     __func__, i);
-            return false;
-        }
-        DPRINT("%s: region %d: Registered userfault for %"
-               PRIx64 " + %" PRIx64 "\n", __func__, i,
-               (uint64_t)reg_struct.range.start,
-               (uint64_t)reg_struct.range.len);
-        /* Now it's registered we can let the client at it */
-        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
-                     dev_region->size + dev_region->mmap_offset,
-                     PROT_READ | PROT_WRITE)) {
-            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
-                     i, strerror(errno));
-            return false;
-        }
-        /* TODO: Stash 'zero' support flags somewhere */
-#endif
-    }
+    (void)generate_faults(dev);
 
     return false;
 }

From 6fb2e173d20c9bbb5466183d33a3ad7dcd0375fa Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:50 +0000
Subject: [PATCH 42/58] Support ram slot configuration in libvhost-user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The VHOST_USER_GET_MAX_MEM_SLOTS message allows a vhost-user backend to
specify a maximum number of ram slots it is willing to support. This
change adds support for libvhost-user to process this message. For now
the backend will reply with 8 as the maximum number of regions
supported.

libvhost-user does not yet support the vhost-user protocol feature
VHOST_USER_PROTOCOL_F_CONFIGIRE_MEM_SLOTS, so qemu should never
send the VHOST_USER_GET_MAX_MEM_SLOTS message. Therefore this new
functionality is not currently used.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-8-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 19 +++++++++++++++++++
 contrib/libvhost-user/libvhost-user.h |  1 +
 2 files changed, 20 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index cccfa22209..9f039b707e 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -137,6 +137,7 @@ vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_SET_INFLIGHT_FD),
         REQ(VHOST_USER_GPU_SET_SOCKET),
         REQ(VHOST_USER_VRING_KICK),
+        REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -1565,6 +1566,22 @@ vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
     return false;
 }
 
+static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg)
+{
+    vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
+    vmsg->size  = sizeof(vmsg->payload.u64);
+    vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS;
+    vmsg->fd_num = 0;
+
+    if (!vu_message_write(dev, dev->sock, vmsg)) {
+        vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno));
+    }
+
+    DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS);
+
+    return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1649,6 +1666,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_set_inflight_fd(dev, vmsg);
     case VHOST_USER_VRING_KICK:
         return vu_handle_vring_kick(dev, vmsg);
+    case VHOST_USER_GET_MAX_MEM_SLOTS:
+        return vu_handle_get_max_memslots(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index f30394fab6..88ef40d26a 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -97,6 +97,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_GPU_SET_SOCKET = 33,
     VHOST_USER_VRING_KICK = 35,
+    VHOST_USER_GET_MAX_MEM_SLOTS = 36,
     VHOST_USER_MAX
 } VhostUserRequest;
 

From ec94c8e621de96c50c2d381c8c9ec94f5beec7c1 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:52 +0000
Subject: [PATCH 43/58] Support adding individual regions in libvhost-user

When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS is enabled, qemu will
transmit memory regions to a backend individually using the new message
VHOST_USER_ADD_MEM_REG. With this change vhost-user backends built with
libvhost-user can now map in new memory regions when VHOST_USER_ADD_MEM_REG
messages are received.

Qemu only sends VHOST_USER_ADD_MEM_REG messages when the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and
since it is not yet supported in libvhost-user, this new functionality
is not yet used.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-9-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 103 ++++++++++++++++++++++++++
 contrib/libvhost-user/libvhost-user.h |   7 ++
 2 files changed, 110 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 9f039b707e..d8ee7a23a3 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -138,6 +138,7 @@ vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_GPU_SET_SOCKET),
         REQ(VHOST_USER_VRING_KICK),
         REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
+        REQ(VHOST_USER_ADD_MEM_REG),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -662,6 +663,106 @@ generate_faults(VuDev *dev) {
     return true;
 }
 
+static bool
+vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
+    int i;
+    bool track_ramblocks = dev->postcopy_listening;
+    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
+    VuDevRegion *dev_region = &dev->regions[dev->nregions];
+    void *mmap_addr;
+
+    /*
+     * If we are in postcopy mode and we receive a u64 payload with a 0 value
+     * we know all the postcopy client bases have been recieved, and we
+     * should start generating faults.
+     */
+    if (track_ramblocks &&
+        vmsg->size == sizeof(vmsg->payload.u64) &&
+        vmsg->payload.u64 == 0) {
+        (void)generate_faults(dev);
+        return false;
+    }
+
+    DPRINT("Adding region: %d\n", dev->nregions);
+    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
+           msg_region->guest_phys_addr);
+    DPRINT("    memory_size:     0x%016"PRIx64"\n",
+           msg_region->memory_size);
+    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
+           msg_region->userspace_addr);
+    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
+           msg_region->mmap_offset);
+
+    dev_region->gpa = msg_region->guest_phys_addr;
+    dev_region->size = msg_region->memory_size;
+    dev_region->qva = msg_region->userspace_addr;
+    dev_region->mmap_offset = msg_region->mmap_offset;
+
+    /*
+     * We don't use offset argument of mmap() since the
+     * mapped address has to be page aligned, and we use huge
+     * pages.
+     */
+    if (track_ramblocks) {
+        /*
+         * In postcopy we're using PROT_NONE here to catch anyone
+         * accessing it before we userfault.
+         */
+        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+                         PROT_NONE, MAP_SHARED,
+                         vmsg->fds[0], 0);
+    } else {
+        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
+                         PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0],
+                         0);
+    }
+
+    if (mmap_addr == MAP_FAILED) {
+        vu_panic(dev, "region mmap error: %s", strerror(errno));
+    } else {
+        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+        DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
+               dev_region->mmap_addr);
+    }
+
+    close(vmsg->fds[0]);
+
+    if (track_ramblocks) {
+        /*
+         * Return the address to QEMU so that it can translate the ufd
+         * fault addresses back.
+         */
+        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
+                                                 dev_region->mmap_offset);
+
+        /* Send the message back to qemu with the addresses filled in. */
+        vmsg->fd_num = 0;
+        if (!vu_send_reply(dev, dev->sock, vmsg)) {
+            vu_panic(dev, "failed to respond to add-mem-region for postcopy");
+            return false;
+        }
+
+        DPRINT("Successfully added new region in postcopy\n");
+        dev->nregions++;
+        return false;
+
+    } else {
+        for (i = 0; i < dev->max_queues; i++) {
+            if (dev->vq[i].vring.desc) {
+                if (map_ring(dev, &dev->vq[i])) {
+                    vu_panic(dev, "remapping queue %d for new memory region",
+                             i);
+                }
+            }
+        }
+
+        DPRINT("Successfully added new region\n");
+        dev->nregions++;
+        vmsg_set_reply_u64(vmsg, 0);
+        return true;
+    }
+}
+
 static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1668,6 +1769,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_handle_vring_kick(dev, vmsg);
     case VHOST_USER_GET_MAX_MEM_SLOTS:
         return vu_handle_get_max_memslots(dev, vmsg);
+    case VHOST_USER_ADD_MEM_REG:
+        return vu_add_mem_reg(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 88ef40d26a..60ef7fd13e 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -98,6 +98,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_GPU_SET_SOCKET = 33,
     VHOST_USER_VRING_KICK = 35,
     VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+    VHOST_USER_ADD_MEM_REG = 37,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -124,6 +125,11 @@ typedef struct VhostUserMemory {
     VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserMemRegMsg {
+    uint32_t padding;
+    VhostUserMemoryRegion region;
+} VhostUserMemRegMsg;
+
 typedef struct VhostUserLog {
     uint64_t mmap_size;
     uint64_t mmap_offset;
@@ -176,6 +182,7 @@ typedef struct VhostUserMsg {
         struct vhost_vring_state state;
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
+        VhostUserMemRegMsg memreg;
         VhostUserLog log;
         VhostUserConfig config;
         VhostUserVringArea area;

From 875b9fd97b34dae796d61330804b094ca7b1b403 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:56 +0000
Subject: [PATCH 44/58] Support individual region unmap in libvhost-user

When the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature is
enabled, on memory hot-unplug qemu will transmit memory regions to
remove individually using the new message VHOST_USER_REM_MEM_REG
message. With this change, vhost-user backends build with libvhost-user
can now unmap individual memory regions when receiving the
VHOST_USER_REM_MEM_REG message.

Qemu only sends VHOST_USER_REM_MEM_REG messages when the
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS feature is negotiated, and
support for that feature has not yet been added in libvhost-user, this
new functionality is not yet used.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-10-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 63 +++++++++++++++++++++++++++
 contrib/libvhost-user/libvhost-user.h |  1 +
 2 files changed, 64 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index d8ee7a23a3..386449b697 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -139,6 +139,7 @@ vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_VRING_KICK),
         REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
         REQ(VHOST_USER_ADD_MEM_REG),
+        REQ(VHOST_USER_REM_MEM_REG),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -763,6 +764,66 @@ vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     }
 }
 
+static inline bool reg_equal(VuDevRegion *vudev_reg,
+                             VhostUserMemoryRegion *msg_reg)
+{
+    if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
+        vudev_reg->qva == msg_reg->userspace_addr &&
+        vudev_reg->size == msg_reg->memory_size) {
+        return true;
+    }
+
+    return false;
+}
+
+static bool
+vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
+    int i, j;
+    bool found = false;
+    VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {};
+    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
+
+    DPRINT("Removing region:\n");
+    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
+           msg_region->guest_phys_addr);
+    DPRINT("    memory_size:     0x%016"PRIx64"\n",
+           msg_region->memory_size);
+    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
+           msg_region->userspace_addr);
+    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
+           msg_region->mmap_offset);
+
+    for (i = 0, j = 0; i < dev->nregions; i++) {
+        if (!reg_equal(&dev->regions[i], msg_region)) {
+            shadow_regions[j].gpa = dev->regions[i].gpa;
+            shadow_regions[j].size = dev->regions[i].size;
+            shadow_regions[j].qva = dev->regions[i].qva;
+            shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset;
+            j++;
+        } else {
+            found = true;
+            VuDevRegion *r = &dev->regions[i];
+            void *m = (void *) (uintptr_t) r->mmap_addr;
+
+            if (m) {
+                munmap(m, r->size + r->mmap_offset);
+            }
+        }
+    }
+
+    if (found) {
+        memcpy(dev->regions, shadow_regions,
+               sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS);
+        DPRINT("Successfully removed a region\n");
+        dev->nregions--;
+        vmsg_set_reply_u64(vmsg, 0);
+    } else {
+        vu_panic(dev, "Specified region not found\n");
+    }
+
+    return true;
+}
+
 static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1771,6 +1832,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_handle_get_max_memslots(dev, vmsg);
     case VHOST_USER_ADD_MEM_REG:
         return vu_add_mem_reg(dev, vmsg);
+    case VHOST_USER_REM_MEM_REG:
+        return vu_rem_mem_reg(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 60ef7fd13e..f8439713a8 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -99,6 +99,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_VRING_KICK = 35,
     VHOST_USER_GET_MAX_MEM_SLOTS = 36,
     VHOST_USER_ADD_MEM_REG = 37,
+    VHOST_USER_REM_MEM_REG = 38,
     VHOST_USER_MAX
 } VhostUserRequest;
 

From b650d5f4b1cd3f9f8c4fdb319838c5c1e0695e41 Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.norwitz@nutanix.com>
Date: Thu, 21 May 2020 05:00:59 +0000
Subject: [PATCH 45/58] Lift max ram slots limit in libvhost-user

Historically, VMs with vhost-user devices could hot-add memory a maximum
of 8 times. Now that the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS
protocol feature has been added, VMs with vhost-user backends which
support this new feature can support a configurable number of ram slots
up to the maximum supported by the target platform.

This change adds VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS support for
backends built with libvhost-user, and increases the number of supported
ram slots from 8 to 32.

Memory hot-add, hot-remove and postcopy migration were tested with
the vhost-user-bridge sample.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <1588533678-23450-11-git-send-email-raphael.norwitz@nutanix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 17 +++++++++--------
 contrib/libvhost-user/libvhost-user.h | 15 +++++++++++----
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 386449b697..b1e607298c 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -269,7 +269,7 @@ have_userfault(void)
 static bool
 vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
-    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
     struct iovec iov = {
         .iov_base = (char *)vmsg,
         .iov_len = VHOST_USER_HDR_SIZE,
@@ -340,7 +340,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 {
     int rc;
     uint8_t *p = (uint8_t *)vmsg;
-    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
+    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
     struct iovec iov = {
         .iov_base = (char *)vmsg,
         .iov_len = VHOST_USER_HDR_SIZE,
@@ -353,7 +353,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
     struct cmsghdr *cmsg;
 
     memset(control, 0, sizeof(control));
-    assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
+    assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
     if (vmsg->fd_num > 0) {
         size_t fdsize = vmsg->fd_num * sizeof(int);
         msg.msg_controllen = CMSG_SPACE(fdsize);
@@ -780,7 +780,7 @@ static bool
 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
     int i, j;
     bool found = false;
-    VuDevRegion shadow_regions[VHOST_MEMORY_MAX_NREGIONS] = {};
+    VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {};
     VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
 
     DPRINT("Removing region:\n");
@@ -813,7 +813,7 @@ vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
 
     if (found) {
         memcpy(dev->regions, shadow_regions,
-               sizeof(VuDevRegion) * VHOST_MEMORY_MAX_NREGIONS);
+               sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS);
         DPRINT("Successfully removed a region\n");
         dev->nregions--;
         vmsg_set_reply_u64(vmsg, 0);
@@ -1394,7 +1394,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
                         1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
                         1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                         1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
-                        1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+                        1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
+                        1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;
 
     if (have_userfault()) {
         features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
@@ -1732,14 +1733,14 @@ static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg)
 {
     vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
     vmsg->size  = sizeof(vmsg->payload.u64);
-    vmsg->payload.u64 = VHOST_MEMORY_MAX_NREGIONS;
+    vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS;
     vmsg->fd_num = 0;
 
     if (!vu_message_write(dev, dev->sock, vmsg)) {
         vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno));
     }
 
-    DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_MEMORY_MAX_NREGIONS);
+    DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS);
 
     return false;
 }
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index f8439713a8..844c37c648 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -28,7 +28,13 @@
 
 #define VIRTQUEUE_MAX_SIZE 1024
 
-#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_MEMORY_BASELINE_NREGIONS 8
+
+/*
+ * Set a reasonable maximum number of ram slots, which will be supported by
+ * any architecture.
+ */
+#define VHOST_USER_MAX_RAM_SLOTS 32
 
 typedef enum VhostSetConfigType {
     VHOST_SET_CONFIG_TYPE_MASTER = 0,
@@ -55,6 +61,7 @@ enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
     VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
     VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
+    VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
 
     VHOST_USER_PROTOCOL_F_MAX
 };
@@ -123,7 +130,7 @@ typedef struct VhostUserMemoryRegion {
 typedef struct VhostUserMemory {
     uint32_t nregions;
     uint32_t padding;
-    VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+    VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
 } VhostUserMemory;
 
 typedef struct VhostUserMemRegMsg {
@@ -190,7 +197,7 @@ typedef struct VhostUserMsg {
         VhostUserInflight inflight;
     } payload;
 
-    int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fds[VHOST_MEMORY_BASELINE_NREGIONS];
     int fd_num;
     uint8_t *data;
 } VU_PACKED VhostUserMsg;
@@ -368,7 +375,7 @@ typedef struct VuDevInflightInfo {
 struct VuDev {
     int sock;
     uint32_t nregions;
-    VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+    VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS];
     VuVirtq *vq;
     VuDevInflightInfo inflight_info;
     int log_call_fd;

From a9a5c473d25d83bcdad27d97e5fb451950ee894a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Fri, 29 May 2020 17:13:38 +0100
Subject: [PATCH 46/58] libvhost-user: advertise vring features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

libvhost-user implements several vring features without advertising
them. There is no way for the vhost-user master to detect support for
these features.

Things more or less work today because QEMU assumes the vhost-user
backend always implements certain feature bits like
VIRTIO_RING_F_EVENT_IDX. This is not documented anywhere.

This patch explicitly advertises features implemented in libvhost-user
so that the vhost-user master does not need to make undocumented
assumptions.

Feature bits that libvhost-user now advertises can be removed from
vhost-user-blk.c. Devices should not be responsible for advertising
vring feature bits, that is libvhost-user's job.

Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20200529161338.456017-1-stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c   | 10 ++++++++++
 contrib/vhost-user-blk/vhost-user-blk.c |  4 +---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index b1e607298c..d315db1396 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -498,6 +498,16 @@ static bool
 vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
     vmsg->payload.u64 =
+        /*
+         * The following VIRTIO feature bits are supported by our virtqueue
+         * implementation:
+         */
+        1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
+        1ULL << VIRTIO_RING_F_INDIRECT_DESC |
+        1ULL << VIRTIO_RING_F_EVENT_IDX |
+        1ULL << VIRTIO_F_VERSION_1 |
+
+        /* vhost-user feature bits */
         1ULL << VHOST_F_LOG_ALL |
         1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
 
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index 6fd91c7e99..25eccd02b5 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -382,9 +382,7 @@ vub_get_features(VuDev *dev)
                1ull << VIRTIO_BLK_F_DISCARD |
                1ull << VIRTIO_BLK_F_WRITE_ZEROES |
                #endif
-               1ull << VIRTIO_BLK_F_CONFIG_WCE |
-               1ull << VIRTIO_F_VERSION_1 |
-               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+               1ull << VIRTIO_BLK_F_CONFIG_WCE;
 
     if (vdev_blk->enable_ro) {
         features |= 1ull << VIRTIO_BLK_F_RO;

From 00823980b296c80a3ff7b448df20c48897cd62e9 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 27 May 2020 17:31:52 +0200
Subject: [PATCH 47/58] hw/pci: Fix crash when running QEMU with "-nic
 model=rocker"

QEMU currently aborts when being started with "-nic model=rocker" or with
"-net nic,model=rocker". This happens because the "rocker" device is not
a normal NIC but a switch, which has different properties. Thus we should
only consider real NIC devices for "-nic" and "-net". These devices can
be identified by the "netdev" property, so check for this property before
adding the device to the list.

Reported-by: Michael Tokarev <mjt@tls.msk.ru>
Fixes: 52310c3fa7dc854d ("net: allow using any PCI NICs in -net or -nic")
Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20200527153152.9211-1-thuth@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pci.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 7bf2ae6d92..1b88a32cf7 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1891,7 +1891,18 @@ PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
         if (test_bit(DEVICE_CATEGORY_NETWORK, dc->categories) &&
             dc->user_creatable) {
             const char *name = object_class_get_name(list->data);
-            g_ptr_array_add(pci_nic_models, (gpointer)name);
+            /*
+             * A network device might also be something else than a NIC, see
+             * e.g. the "rocker" device. Thus we have to look for the "netdev"
+             * property, too. Unfortunately, some devices like virtio-net only
+             * create this property during instance_init, so we have to create
+             * a temporary instance here to be able to check it.
+             */
+            Object *obj = object_new_with_class(OBJECT_CLASS(dc));
+            if (object_property_find(obj, "netdev", NULL)) {
+                g_ptr_array_add(pci_nic_models, (gpointer)name);
+            }
+            object_unref(obj);
         }
         next = list->next;
         g_slist_free_1(list);

From c6136ec0c6fea0db638de08720018fd4bc777787 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 22 May 2020 14:25:10 +0200
Subject: [PATCH 48/58] vhost-vsock: add vhost-vsock-common abstraction

This patch prepares the introduction of vhost-user-vsock, moving
the common code usable for both vhost-vsock and vhost-user-vsock
devices, in the new vhost-vsock-common parent class.

While moving the code, fixed checkpatch warnings about block comments.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20200522122512.87413-2-sgarzare@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/Makefile.objs                |   2 +-
 hw/virtio/vhost-vsock-common.c         | 258 ++++++++++++++++++++++
 hw/virtio/vhost-vsock.c                | 283 ++++---------------------
 include/hw/virtio/vhost-vsock-common.h |  47 ++++
 include/hw/virtio/vhost-vsock.h        |  11 +-
 5 files changed, 350 insertions(+), 251 deletions(-)
 create mode 100644 hw/virtio/vhost-vsock-common.c
 create mode 100644 include/hw/virtio/vhost-vsock-common.h

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 4e4d39a0a4..b1eeb44eac 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -17,7 +17,7 @@ obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
 common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pmem-pci.o
 obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
-obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
+obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o
diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c
new file mode 100644
index 0000000000..5b2ebf3496
--- /dev/null
+++ b/hw/virtio/vhost-vsock-common.c
@@ -0,0 +1,258 @@
+/*
+ * Parent class for vhost-vsock devices
+ *
+ * Copyright 2015-2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_vsock.h"
+#include "qapi/error.h"
+#include "hw/virtio/virtio-access.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-vsock.h"
+#include "qemu/iov.h"
+#include "monitor/monitor.h"
+
+int vhost_vsock_common_start(VirtIODevice *vdev)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int ret;
+    int i;
+
+    if (!k->set_guest_notifiers) {
+        error_report("binding does not support guest notifiers");
+        return -ENOSYS;
+    }
+
+    ret = vhost_dev_enable_notifiers(&vvc->vhost_dev, vdev);
+    if (ret < 0) {
+        error_report("Error enabling host notifiers: %d", -ret);
+        return ret;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, true);
+    if (ret < 0) {
+        error_report("Error binding guest notifier: %d", -ret);
+        goto err_host_notifiers;
+    }
+
+    vvc->vhost_dev.acked_features = vdev->guest_features;
+    ret = vhost_dev_start(&vvc->vhost_dev, vdev);
+    if (ret < 0) {
+        error_report("Error starting vhost: %d", -ret);
+        goto err_guest_notifiers;
+    }
+
+    /*
+     * guest_notifier_mask/pending not used yet, so just unmask
+     * everything here.  virtio-pci will do the right thing by
+     * enabling/disabling irqfd.
+     */
+    for (i = 0; i < vvc->vhost_dev.nvqs; i++) {
+        vhost_virtqueue_mask(&vvc->vhost_dev, vdev, i, false);
+    }
+
+    return 0;
+
+err_guest_notifiers:
+    k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false);
+err_host_notifiers:
+    vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev);
+    return ret;
+}
+
+void vhost_vsock_common_stop(VirtIODevice *vdev)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int ret;
+
+    if (!k->set_guest_notifiers) {
+        return;
+    }
+
+    vhost_dev_stop(&vvc->vhost_dev, vdev);
+
+    ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false);
+    if (ret < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", ret);
+        return;
+    }
+
+    vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev);
+}
+
+
+static void vhost_vsock_common_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    /* Do nothing */
+}
+
+static void vhost_vsock_common_guest_notifier_mask(VirtIODevice *vdev, int idx,
+                                            bool mask)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+    vhost_virtqueue_mask(&vvc->vhost_dev, vdev, idx, mask);
+}
+
+static bool vhost_vsock_common_guest_notifier_pending(VirtIODevice *vdev,
+                                               int idx)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+    return vhost_virtqueue_pending(&vvc->vhost_dev, idx);
+}
+
+static void vhost_vsock_common_send_transport_reset(VHostVSockCommon *vvc)
+{
+    VirtQueueElement *elem;
+    VirtQueue *vq = vvc->event_vq;
+    struct virtio_vsock_event event = {
+        .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET),
+    };
+
+    elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+    if (!elem) {
+        error_report("vhost-vsock missed transport reset event");
+        return;
+    }
+
+    if (elem->out_num) {
+        error_report("invalid vhost-vsock event virtqueue element with "
+                     "out buffers");
+        goto out;
+    }
+
+    if (iov_from_buf(elem->in_sg, elem->in_num, 0,
+                     &event, sizeof(event)) != sizeof(event)) {
+        error_report("vhost-vsock event virtqueue element is too short");
+        goto out;
+    }
+
+    virtqueue_push(vq, elem, sizeof(event));
+    virtio_notify(VIRTIO_DEVICE(vvc), vq);
+
+out:
+    g_free(elem);
+}
+
+static void vhost_vsock_common_post_load_timer_cleanup(VHostVSockCommon *vvc)
+{
+    if (!vvc->post_load_timer) {
+        return;
+    }
+
+    timer_del(vvc->post_load_timer);
+    timer_free(vvc->post_load_timer);
+    vvc->post_load_timer = NULL;
+}
+
+static void vhost_vsock_common_post_load_timer_cb(void *opaque)
+{
+    VHostVSockCommon *vvc = opaque;
+
+    vhost_vsock_common_post_load_timer_cleanup(vvc);
+    vhost_vsock_common_send_transport_reset(vvc);
+}
+
+int vhost_vsock_common_pre_save(void *opaque)
+{
+    VHostVSockCommon *vvc = opaque;
+
+    /*
+     * At this point, backend must be stopped, otherwise
+     * it might keep writing to memory.
+     */
+    assert(!vvc->vhost_dev.started);
+
+    return 0;
+}
+
+int vhost_vsock_common_post_load(void *opaque, int version_id)
+{
+    VHostVSockCommon *vvc = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(vvc);
+
+    if (virtio_queue_get_addr(vdev, 2)) {
+        /*
+         * Defer transport reset event to a vm clock timer so that virtqueue
+         * changes happen after migration has completed.
+         */
+        assert(!vvc->post_load_timer);
+        vvc->post_load_timer =
+            timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                         vhost_vsock_common_post_load_timer_cb,
+                         vvc);
+        timer_mod(vvc->post_load_timer, 1);
+    }
+    return 0;
+}
+
+void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+    virtio_init(vdev, name, VIRTIO_ID_VSOCK,
+                sizeof(struct virtio_vsock_config));
+
+    /* Receive and transmit queues belong to vhost */
+    vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+                                      vhost_vsock_common_handle_output);
+    vvc->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+                                       vhost_vsock_common_handle_output);
+
+    /* The event queue belongs to QEMU */
+    vvc->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+                                       vhost_vsock_common_handle_output);
+
+    vvc->vhost_dev.nvqs = ARRAY_SIZE(vvc->vhost_vqs);
+    vvc->vhost_dev.vqs = vvc->vhost_vqs;
+
+    vvc->post_load_timer = NULL;
+}
+
+void vhost_vsock_common_unrealize(VirtIODevice *vdev)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+    vhost_vsock_common_post_load_timer_cleanup(vvc);
+
+    virtio_delete_queue(vvc->recv_vq);
+    virtio_delete_queue(vvc->trans_vq);
+    virtio_delete_queue(vvc->event_vq);
+    virtio_cleanup(vdev);
+}
+
+static void vhost_vsock_common_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    vdc->guest_notifier_mask = vhost_vsock_common_guest_notifier_mask;
+    vdc->guest_notifier_pending = vhost_vsock_common_guest_notifier_pending;
+}
+
+static const TypeInfo vhost_vsock_common_info = {
+    .name = TYPE_VHOST_VSOCK_COMMON,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VHostVSockCommon),
+    .class_init = vhost_vsock_common_class_init,
+    .abstract = true,
+};
+
+static void vhost_vsock_common_register_types(void)
+{
+    type_register_static(&vhost_vsock_common_info);
+}
+
+type_init(vhost_vsock_common_register_types)
diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
index 4a228f5168..c8f0699b4f 100644
--- a/hw/virtio/vhost-vsock.c
+++ b/hw/virtio/vhost-vsock.c
@@ -12,24 +12,14 @@
  */
 
 #include "qemu/osdep.h"
-#include <sys/ioctl.h>
 #include "standard-headers/linux/virtio_vsock.h"
 #include "qapi/error.h"
-#include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "qemu/error-report.h"
 #include "hw/qdev-properties.h"
 #include "hw/virtio/vhost-vsock.h"
-#include "qemu/iov.h"
-#include "qemu/module.h"
 #include "monitor/monitor.h"
 
-enum {
-    VHOST_VSOCK_SAVEVM_VERSION = 0,
-
-    VHOST_VSOCK_QUEUE_SIZE = 128,
-};
-
 static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config)
 {
     VHostVSock *vsock = VHOST_VSOCK(vdev);
@@ -39,16 +29,18 @@ static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config)
     memcpy(config, &vsockcfg, sizeof(vsockcfg));
 }
 
-static int vhost_vsock_set_guest_cid(VHostVSock *vsock)
+static int vhost_vsock_set_guest_cid(VirtIODevice *vdev)
 {
-    const VhostOps *vhost_ops = vsock->vhost_dev.vhost_ops;
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+    VHostVSock *vsock = VHOST_VSOCK(vdev);
+    const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops;
     int ret;
 
     if (!vhost_ops->vhost_vsock_set_guest_cid) {
         return -ENOSYS;
     }
 
-    ret = vhost_ops->vhost_vsock_set_guest_cid(&vsock->vhost_dev,
+    ret = vhost_ops->vhost_vsock_set_guest_cid(&vvc->vhost_dev,
                                                vsock->conf.guest_cid);
     if (ret < 0) {
         return -errno;
@@ -56,123 +48,58 @@ static int vhost_vsock_set_guest_cid(VHostVSock *vsock)
     return 0;
 }
 
-static int vhost_vsock_set_running(VHostVSock *vsock, int start)
+static int vhost_vsock_set_running(VirtIODevice *vdev, int start)
 {
-    const VhostOps *vhost_ops = vsock->vhost_dev.vhost_ops;
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+    const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops;
     int ret;
 
     if (!vhost_ops->vhost_vsock_set_running) {
         return -ENOSYS;
     }
 
-    ret = vhost_ops->vhost_vsock_set_running(&vsock->vhost_dev, start);
+    ret = vhost_ops->vhost_vsock_set_running(&vvc->vhost_dev, start);
     if (ret < 0) {
         return -errno;
     }
     return 0;
 }
 
-static void vhost_vsock_start(VirtIODevice *vdev)
-{
-    VHostVSock *vsock = VHOST_VSOCK(vdev);
-    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-    int ret;
-    int i;
-
-    if (!k->set_guest_notifiers) {
-        error_report("binding does not support guest notifiers");
-        return;
-    }
-
-    ret = vhost_dev_enable_notifiers(&vsock->vhost_dev, vdev);
-    if (ret < 0) {
-        error_report("Error enabling host notifiers: %d", -ret);
-        return;
-    }
-
-    ret = k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, true);
-    if (ret < 0) {
-        error_report("Error binding guest notifier: %d", -ret);
-        goto err_host_notifiers;
-    }
-
-    vsock->vhost_dev.acked_features = vdev->guest_features;
-    ret = vhost_dev_start(&vsock->vhost_dev, vdev);
-    if (ret < 0) {
-        error_report("Error starting vhost: %d", -ret);
-        goto err_guest_notifiers;
-    }
-
-    ret = vhost_vsock_set_running(vsock, 1);
-    if (ret < 0) {
-        error_report("Error starting vhost vsock: %d", -ret);
-        goto err_dev_start;
-    }
-
-    /* guest_notifier_mask/pending not used yet, so just unmask
-     * everything here.  virtio-pci will do the right thing by
-     * enabling/disabling irqfd.
-     */
-    for (i = 0; i < vsock->vhost_dev.nvqs; i++) {
-        vhost_virtqueue_mask(&vsock->vhost_dev, vdev, i, false);
-    }
-
-    return;
-
-err_dev_start:
-    vhost_dev_stop(&vsock->vhost_dev, vdev);
-err_guest_notifiers:
-    k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, false);
-err_host_notifiers:
-    vhost_dev_disable_notifiers(&vsock->vhost_dev, vdev);
-}
-
-static void vhost_vsock_stop(VirtIODevice *vdev)
-{
-    VHostVSock *vsock = VHOST_VSOCK(vdev);
-    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-    int ret;
-
-    if (!k->set_guest_notifiers) {
-        return;
-    }
-
-    ret = vhost_vsock_set_running(vsock, 0);
-    if (ret < 0) {
-        error_report("vhost vsock set running failed: %d", ret);
-        return;
-    }
-
-    vhost_dev_stop(&vsock->vhost_dev, vdev);
-
-    ret = k->set_guest_notifiers(qbus->parent, vsock->vhost_dev.nvqs, false);
-    if (ret < 0) {
-        error_report("vhost guest notifier cleanup failed: %d", ret);
-        return;
-    }
-
-    vhost_dev_disable_notifiers(&vsock->vhost_dev, vdev);
-}
 
 static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status)
 {
-    VHostVSock *vsock = VHOST_VSOCK(vdev);
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
     bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
+    int ret;
 
     if (!vdev->vm_running) {
         should_start = false;
     }
 
-    if (vsock->vhost_dev.started == should_start) {
+    if (vvc->vhost_dev.started == should_start) {
         return;
     }
 
     if (should_start) {
-        vhost_vsock_start(vdev);
+        ret = vhost_vsock_common_start(vdev);
+        if (ret < 0) {
+            return;
+        }
+
+        ret = vhost_vsock_set_running(vdev, 1);
+        if (ret < 0) {
+            vhost_vsock_common_stop(vdev);
+            error_report("Error starting vhost vsock: %d", -ret);
+            return;
+        }
     } else {
-        vhost_vsock_stop(vdev);
+        ret = vhost_vsock_set_running(vdev, 0);
+        if (ret < 0) {
+            error_report("vhost vsock set running failed: %d", ret);
+            return;
+        }
+
+        vhost_vsock_common_stop(vdev);
     }
 }
 
@@ -184,108 +111,6 @@ static uint64_t vhost_vsock_get_features(VirtIODevice *vdev,
     return requested_features;
 }
 
-static void vhost_vsock_handle_output(VirtIODevice *vdev, VirtQueue *vq)
-{
-    /* Do nothing */
-}
-
-static void vhost_vsock_guest_notifier_mask(VirtIODevice *vdev, int idx,
-                                            bool mask)
-{
-    VHostVSock *vsock = VHOST_VSOCK(vdev);
-
-    vhost_virtqueue_mask(&vsock->vhost_dev, vdev, idx, mask);
-}
-
-static bool vhost_vsock_guest_notifier_pending(VirtIODevice *vdev, int idx)
-{
-    VHostVSock *vsock = VHOST_VSOCK(vdev);
-
-    return vhost_virtqueue_pending(&vsock->vhost_dev, idx);
-}
-
-static void vhost_vsock_send_transport_reset(VHostVSock *vsock)
-{
-    VirtQueueElement *elem;
-    VirtQueue *vq = vsock->event_vq;
-    struct virtio_vsock_event event = {
-        .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET),
-    };
-
-    elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
-    if (!elem) {
-        error_report("vhost-vsock missed transport reset event");
-        return;
-    }
-
-    if (elem->out_num) {
-        error_report("invalid vhost-vsock event virtqueue element with "
-                     "out buffers");
-        goto out;
-    }
-
-    if (iov_from_buf(elem->in_sg, elem->in_num, 0,
-                     &event, sizeof(event)) != sizeof(event)) {
-        error_report("vhost-vsock event virtqueue element is too short");
-        goto out;
-    }
-
-    virtqueue_push(vq, elem, sizeof(event));
-    virtio_notify(VIRTIO_DEVICE(vsock), vq);
-
-out:
-    g_free(elem);
-}
-
-static void vhost_vsock_post_load_timer_cleanup(VHostVSock *vsock)
-{
-    if (!vsock->post_load_timer) {
-        return;
-    }
-
-    timer_del(vsock->post_load_timer);
-    timer_free(vsock->post_load_timer);
-    vsock->post_load_timer = NULL;
-}
-
-static void vhost_vsock_post_load_timer_cb(void *opaque)
-{
-    VHostVSock *vsock = opaque;
-
-    vhost_vsock_post_load_timer_cleanup(vsock);
-    vhost_vsock_send_transport_reset(vsock);
-}
-
-static int vhost_vsock_pre_save(void *opaque)
-{
-    VHostVSock *vsock = opaque;
-
-    /* At this point, backend must be stopped, otherwise
-     * it might keep writing to memory. */
-    assert(!vsock->vhost_dev.started);
-
-    return 0;
-}
-
-static int vhost_vsock_post_load(void *opaque, int version_id)
-{
-    VHostVSock *vsock = opaque;
-    VirtIODevice *vdev = VIRTIO_DEVICE(vsock);
-
-    if (virtio_queue_get_addr(vdev, 2)) {
-        /* Defer transport reset event to a vm clock timer so that virtqueue
-         * changes happen after migration has completed.
-         */
-        assert(!vsock->post_load_timer);
-        vsock->post_load_timer =
-            timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                         vhost_vsock_post_load_timer_cb,
-                         vsock);
-        timer_mod(vsock->post_load_timer, 1);
-    }
-    return 0;
-}
-
 static const VMStateDescription vmstate_virtio_vhost_vsock = {
     .name = "virtio-vhost_vsock",
     .minimum_version_id = VHOST_VSOCK_SAVEVM_VERSION,
@@ -294,12 +119,13 @@ static const VMStateDescription vmstate_virtio_vhost_vsock = {
         VMSTATE_VIRTIO_DEVICE,
         VMSTATE_END_OF_LIST()
     },
-    .pre_save = vhost_vsock_pre_save,
-    .post_load = vhost_vsock_post_load,
+    .pre_save = vhost_vsock_common_pre_save,
+    .post_load = vhost_vsock_common_post_load,
 };
 
 static void vhost_vsock_device_realize(DeviceState *dev, Error **errp)
 {
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostVSock *vsock = VHOST_VSOCK(dev);
     int vhostfd;
@@ -331,46 +157,29 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp)
         }
     }
 
-    virtio_init(vdev, "vhost-vsock", VIRTIO_ID_VSOCK,
-                sizeof(struct virtio_vsock_config));
+    vhost_vsock_common_realize(vdev, "vhost-vsock");
 
-    /* Receive and transmit queues belong to vhost */
-    vsock->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
-                                      vhost_vsock_handle_output);
-    vsock->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
-                                       vhost_vsock_handle_output);
-
-    /* The event queue belongs to QEMU */
-    vsock->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
-                                       vhost_vsock_handle_output);
-
-    vsock->vhost_dev.nvqs = ARRAY_SIZE(vsock->vhost_vqs);
-    vsock->vhost_dev.vqs = vsock->vhost_vqs;
-    ret = vhost_dev_init(&vsock->vhost_dev, (void *)(uintptr_t)vhostfd,
+    ret = vhost_dev_init(&vvc->vhost_dev, (void *)(uintptr_t)vhostfd,
                          VHOST_BACKEND_TYPE_KERNEL, 0);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "vhost-vsock: vhost_dev_init failed");
         goto err_virtio;
     }
 
-    ret = vhost_vsock_set_guest_cid(vsock);
+    ret = vhost_vsock_set_guest_cid(vdev);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "vhost-vsock: unable to set guest cid");
         goto err_vhost_dev;
     }
 
-    vsock->post_load_timer = NULL;
     return;
 
 err_vhost_dev:
-    vhost_dev_cleanup(&vsock->vhost_dev);
+    vhost_dev_cleanup(&vvc->vhost_dev);
     /* vhost_dev_cleanup() closes the vhostfd passed to vhost_dev_init() */
     vhostfd = -1;
 err_virtio:
-    virtio_delete_queue(vsock->recv_vq);
-    virtio_delete_queue(vsock->trans_vq);
-    virtio_delete_queue(vsock->event_vq);
-    virtio_cleanup(vdev);
+    vhost_vsock_common_unrealize(vdev);
     if (vhostfd >= 0) {
         close(vhostfd);
     }
@@ -379,19 +188,14 @@ err_virtio:
 
 static void vhost_vsock_device_unrealize(DeviceState *dev)
 {
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
-    VHostVSock *vsock = VHOST_VSOCK(dev);
-
-    vhost_vsock_post_load_timer_cleanup(vsock);
 
     /* This will stop vhost backend if appropriate. */
     vhost_vsock_set_status(vdev, 0);
 
-    vhost_dev_cleanup(&vsock->vhost_dev);
-    virtio_delete_queue(vsock->recv_vq);
-    virtio_delete_queue(vsock->trans_vq);
-    virtio_delete_queue(vsock->event_vq);
-    virtio_cleanup(vdev);
+    vhost_dev_cleanup(&vvc->vhost_dev);
+    vhost_vsock_common_unrealize(vdev);
 }
 
 static Property vhost_vsock_properties[] = {
@@ -407,19 +211,16 @@ static void vhost_vsock_class_init(ObjectClass *klass, void *data)
 
     device_class_set_props(dc, vhost_vsock_properties);
     dc->vmsd = &vmstate_virtio_vhost_vsock;
-    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     vdc->realize = vhost_vsock_device_realize;
     vdc->unrealize = vhost_vsock_device_unrealize;
     vdc->get_features = vhost_vsock_get_features;
     vdc->get_config = vhost_vsock_get_config;
     vdc->set_status = vhost_vsock_set_status;
-    vdc->guest_notifier_mask = vhost_vsock_guest_notifier_mask;
-    vdc->guest_notifier_pending = vhost_vsock_guest_notifier_pending;
 }
 
 static const TypeInfo vhost_vsock_info = {
     .name = TYPE_VHOST_VSOCK,
-    .parent = TYPE_VIRTIO_DEVICE,
+    .parent = TYPE_VHOST_VSOCK_COMMON,
     .instance_size = sizeof(VHostVSock),
     .class_init = vhost_vsock_class_init,
 };
diff --git a/include/hw/virtio/vhost-vsock-common.h b/include/hw/virtio/vhost-vsock-common.h
new file mode 100644
index 0000000000..f8b4aaae00
--- /dev/null
+++ b/include/hw/virtio/vhost-vsock-common.h
@@ -0,0 +1,47 @@
+/*
+ * Parent class for vhost-vsock devices
+ *
+ * Copyright 2015-2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _QEMU_VHOST_VSOCK_COMMON_H
+#define _QEMU_VHOST_VSOCK_COMMON_H
+
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vhost.h"
+
+#define TYPE_VHOST_VSOCK_COMMON "vhost-vsock-common"
+#define VHOST_VSOCK_COMMON(obj) \
+        OBJECT_CHECK(VHostVSockCommon, (obj), TYPE_VHOST_VSOCK_COMMON)
+
+enum {
+    VHOST_VSOCK_SAVEVM_VERSION = 0,
+
+    VHOST_VSOCK_QUEUE_SIZE = 128,
+};
+
+typedef struct {
+    VirtIODevice parent;
+
+    struct vhost_virtqueue vhost_vqs[2];
+    struct vhost_dev vhost_dev;
+
+    VirtQueue *event_vq;
+    VirtQueue *recv_vq;
+    VirtQueue *trans_vq;
+
+    QEMUTimer *post_load_timer;
+} VHostVSockCommon;
+
+int vhost_vsock_common_start(VirtIODevice *vdev);
+void vhost_vsock_common_stop(VirtIODevice *vdev);
+int vhost_vsock_common_pre_save(void *opaque);
+int vhost_vsock_common_post_load(void *opaque, int version_id);
+void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name);
+void vhost_vsock_common_unrealize(VirtIODevice *vdev);
+
+#endif /* _QEMU_VHOST_VSOCK_COMMON_H */
diff --git a/include/hw/virtio/vhost-vsock.h b/include/hw/virtio/vhost-vsock.h
index bc5a988ee5..8cbb7b90f9 100644
--- a/include/hw/virtio/vhost-vsock.h
+++ b/include/hw/virtio/vhost-vsock.h
@@ -14,8 +14,7 @@
 #ifndef QEMU_VHOST_VSOCK_H
 #define QEMU_VHOST_VSOCK_H
 
-#include "hw/virtio/virtio.h"
-#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-vsock-common.h"
 
 #define TYPE_VHOST_VSOCK "vhost-vsock-device"
 #define VHOST_VSOCK(obj) \
@@ -28,14 +27,8 @@ typedef struct {
 
 typedef struct {
     /*< private >*/
-    VirtIODevice parent;
+    VHostVSockCommon parent;
     VHostVSockConf conf;
-    struct vhost_virtqueue vhost_vqs[2];
-    struct vhost_dev vhost_dev;
-    VirtQueue *event_vq;
-    VirtQueue *recv_vq;
-    VirtQueue *trans_vq;
-    QEMUTimer *post_load_timer;
 
     /*< public >*/
 } VHostVSock;

From 5fe97d8829e8bc2a297474df20cd3ac12eebdbba Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 22 May 2020 14:25:11 +0200
Subject: [PATCH 49/58] virtio: add vhost-user-vsock base device

This patch introduces a vhost-user device for vsock, using the
vhost-vsock-common parent class.

The vhost-user-vsock device can be used to implement the virtio-vsock
device emulation in user-space.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20200522122512.87413-3-sgarzare@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 configure                            |   3 +
 hw/virtio/Makefile.objs              |   1 +
 hw/virtio/vhost-user-vsock.c         | 181 +++++++++++++++++++++++++++
 include/hw/virtio/vhost-user-vsock.h |  36 ++++++
 4 files changed, 221 insertions(+)
 create mode 100644 hw/virtio/vhost-user-vsock.c
 create mode 100644 include/hw/virtio/vhost-user-vsock.h

diff --git a/configure b/configure
index 597e909b53..7c2adf36e5 100755
--- a/configure
+++ b/configure
@@ -7196,6 +7196,9 @@ if test "$vhost_crypto" = "yes" ; then
 fi
 if test "$vhost_vsock" = "yes" ; then
   echo "CONFIG_VHOST_VSOCK=y" >> $config_host_mak
+  if test "$vhost_user" = "yes" ; then
+    echo "CONFIG_VHOST_USER_VSOCK=y" >> $config_host_mak
+  fi
 fi
 if test "$vhost_kernel" = "yes" ; then
   echo "CONFIG_VHOST_KERNEL=y" >> $config_host_mak
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index b1eeb44eac..dd42daedb1 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -18,6 +18,7 @@ common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pme
 obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o
+obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
new file mode 100644
index 0000000000..3534a39d62
--- /dev/null
+++ b/hw/virtio/vhost-user-vsock.c
@@ -0,0 +1,181 @@
+/*
+ * Vhost-user vsock virtio device
+ *
+ * Copyright 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-vsock.h"
+
+static const int user_feature_bits[] = {
+    VIRTIO_F_VERSION_1,
+    VIRTIO_RING_F_INDIRECT_DESC,
+    VIRTIO_RING_F_EVENT_IDX,
+    VIRTIO_F_NOTIFY_ON_EMPTY,
+    VHOST_INVALID_FEATURE_BIT
+};
+
+static void vuv_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VHostUserVSock *vsock = VHOST_USER_VSOCK(vdev);
+
+    memcpy(config, &vsock->vsockcfg, sizeof(struct virtio_vsock_config));
+}
+
+static int vuv_handle_config_change(struct vhost_dev *dev)
+{
+    VHostUserVSock *vsock = VHOST_USER_VSOCK(dev->vdev);
+    int ret = vhost_dev_get_config(dev, (uint8_t *)&vsock->vsockcfg,
+                                   sizeof(struct virtio_vsock_config));
+    if (ret < 0) {
+        error_report("get config space failed");
+        return -1;
+    }
+
+    virtio_notify_config(dev->vdev);
+
+    return 0;
+}
+
+const VhostDevConfigOps vsock_ops = {
+    .vhost_dev_config_notifier = vuv_handle_config_change,
+};
+
+static void vuv_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+    bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
+
+    if (!vdev->vm_running) {
+        should_start = false;
+    }
+
+    if (vvc->vhost_dev.started == should_start) {
+        return;
+    }
+
+    if (should_start) {
+        int ret = vhost_vsock_common_start(vdev);
+        if (ret < 0) {
+            return;
+        }
+    } else {
+        vhost_vsock_common_stop(vdev);
+    }
+}
+
+static uint64_t vuv_get_features(VirtIODevice *vdev,
+                                 uint64_t features,
+                                 Error **errp)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+    return vhost_get_features(&vvc->vhost_dev, user_feature_bits, features);
+}
+
+static const VMStateDescription vuv_vmstate = {
+    .name = "vhost-user-vsock",
+    .unmigratable = 1,
+};
+
+static void vuv_device_realize(DeviceState *dev, Error **errp)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserVSock *vsock = VHOST_USER_VSOCK(dev);
+    int ret;
+
+    if (!vsock->conf.chardev.chr) {
+        error_setg(errp, "missing chardev");
+        return;
+    }
+
+    if (!vhost_user_init(&vsock->vhost_user, &vsock->conf.chardev, errp)) {
+        return;
+    }
+
+    vhost_vsock_common_realize(vdev, "vhost-user-vsock");
+
+    vhost_dev_set_config_notifier(&vvc->vhost_dev, &vsock_ops);
+
+    ret = vhost_dev_init(&vvc->vhost_dev, &vsock->vhost_user,
+                         VHOST_BACKEND_TYPE_USER, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "vhost_dev_init failed");
+        goto err_virtio;
+    }
+
+    ret = vhost_dev_get_config(&vvc->vhost_dev, (uint8_t *)&vsock->vsockcfg,
+                               sizeof(struct virtio_vsock_config));
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "get config space failed");
+        goto err_vhost_dev;
+    }
+
+    return;
+
+err_vhost_dev:
+    vhost_dev_cleanup(&vvc->vhost_dev);
+err_virtio:
+    vhost_vsock_common_unrealize(vdev);
+    vhost_user_cleanup(&vsock->vhost_user);
+    return;
+}
+
+static void vuv_device_unrealize(DeviceState *dev)
+{
+    VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserVSock *vsock = VHOST_USER_VSOCK(dev);
+
+    /* This will stop vhost backend if appropriate. */
+    vuv_set_status(vdev, 0);
+
+    vhost_dev_cleanup(&vvc->vhost_dev);
+
+    vhost_vsock_common_unrealize(vdev);
+
+    vhost_user_cleanup(&vsock->vhost_user);
+
+}
+
+static Property vuv_properties[] = {
+    DEFINE_PROP_CHR("chardev", VHostUserVSock, conf.chardev),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vuv_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, vuv_properties);
+    dc->vmsd = &vuv_vmstate;
+    vdc->realize = vuv_device_realize;
+    vdc->unrealize = vuv_device_unrealize;
+    vdc->get_features = vuv_get_features;
+    vdc->get_config = vuv_get_config;
+    vdc->set_status = vuv_set_status;
+}
+
+static const TypeInfo vuv_info = {
+    .name = TYPE_VHOST_USER_VSOCK,
+    .parent = TYPE_VHOST_VSOCK_COMMON,
+    .instance_size = sizeof(VHostUserVSock),
+    .class_init = vuv_class_init,
+};
+
+static void vuv_register_types(void)
+{
+    type_register_static(&vuv_info);
+}
+
+type_init(vuv_register_types)
diff --git a/include/hw/virtio/vhost-user-vsock.h b/include/hw/virtio/vhost-user-vsock.h
new file mode 100644
index 0000000000..4e128a4b9f
--- /dev/null
+++ b/include/hw/virtio/vhost-user-vsock.h
@@ -0,0 +1,36 @@
+/*
+ * Vhost-user vsock virtio device
+ *
+ * Copyright 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#ifndef _QEMU_VHOST_USER_VSOCK_H
+#define _QEMU_VHOST_USER_VSOCK_H
+
+#include "hw/virtio/vhost-vsock-common.h"
+#include "hw/virtio/vhost-user.h"
+#include "standard-headers/linux/virtio_vsock.h"
+
+#define TYPE_VHOST_USER_VSOCK "vhost-user-vsock-device"
+#define VHOST_USER_VSOCK(obj) \
+        OBJECT_CHECK(VHostUserVSock, (obj), TYPE_VHOST_USER_VSOCK)
+
+typedef struct {
+    CharBackend chardev;
+} VHostUserVSockConf;
+
+typedef struct {
+    /*< private >*/
+    VHostVSockCommon parent;
+    VhostUserState vhost_user;
+    VHostUserVSockConf conf;
+    struct virtio_vsock_config vsockcfg;
+
+    /*< public >*/
+} VHostUserVSock;
+
+#endif /* _QEMU_VHOST_USER_VSOCK_H */

From 9b83bb27477f3fb2c08db32ea2ee9e8f817f0f9d Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 22 May 2020 14:25:12 +0200
Subject: [PATCH 50/58] virtio: add vhost-user-vsock-pci device

Add the PCI version of vhost-user-vsock

Launch QEMU like this:

  qemu -chardev socket,path=/tmp/vm.vsock,id=chr0 \
       -device vhost-user-vsock-pci,chardev=chr0

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20200522122512.87413-4-sgarzare@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/Makefile.objs          |  1 +
 hw/virtio/vhost-user-vsock-pci.c | 84 ++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 hw/virtio/vhost-user-vsock-pci.c

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index dd42daedb1..13e75f171f 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -22,6 +22,7 @@ obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o
+obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-user-vsock-pci.o
 obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk-pci.o
 obj-$(CONFIG_VHOST_USER_INPUT) += vhost-user-input-pci.o
 obj-$(CONFIG_VHOST_USER_SCSI) += vhost-user-scsi-pci.o
diff --git a/hw/virtio/vhost-user-vsock-pci.c b/hw/virtio/vhost-user-vsock-pci.c
new file mode 100644
index 0000000000..0a6847e6fc
--- /dev/null
+++ b/hw/virtio/vhost-user-vsock-pci.c
@@ -0,0 +1,84 @@
+/*
+ * Vhost-user vsock PCI Bindings
+ *
+ * Copyright 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "virtio-pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-vsock.h"
+
+typedef struct VHostUserVSockPCI VHostUserVSockPCI;
+
+/*
+ * vhost-user-vsock-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_USER_VSOCK_PCI "vhost-user-vsock-pci-base"
+#define VHOST_USER_VSOCK_PCI(obj) \
+        OBJECT_CHECK(VHostUserVSockPCI, (obj), TYPE_VHOST_USER_VSOCK_PCI)
+
+struct VHostUserVSockPCI {
+    VirtIOPCIProxy parent_obj;
+    VHostUserVSock vdev;
+};
+
+/* vhost-user-vsock-pci */
+
+static Property vhost_user_vsock_pci_properties[] = {
+    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void vhost_user_vsock_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+    k->realize = vhost_user_vsock_pci_realize;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    device_class_set_props(dc, vhost_user_vsock_pci_properties);
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK;
+    pcidev_k->revision = 0x00;
+    pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_user_vsock_pci_instance_init(Object *obj)
+{
+    VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_USER_VSOCK);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_vsock_pci_info = {
+    .base_name             = TYPE_VHOST_USER_VSOCK_PCI,
+    .generic_name          = "vhost-user-vsock-pci",
+    .transitional_name     = "vhost-user-vsock-pci-transitional",
+    .non_transitional_name = "vhost-user-vsock-pci-non-transitional",
+    .instance_size = sizeof(VHostUserVSockPCI),
+    .instance_init = vhost_user_vsock_pci_instance_init,
+    .class_init    = vhost_user_vsock_pci_class_init,
+};
+
+static void virtio_pci_vhost_register(void)
+{
+    virtio_pci_types_register(&vhost_user_vsock_pci_info);
+}
+
+type_init(virtio_pci_vhost_register)

From 1dc32f9aeb127d995efd52a29f57460c011780da Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 20 May 2020 15:19:46 +0200
Subject: [PATCH 51/58] acpi: make build_madt() more generic.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove PCMachineState dependency from build_madt().
Pass AcpiDeviceIf as separate argument instead of
depending on PCMachineState->acpi_dev.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <20200520132003.9492-6-kraxel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 1ecb68f45f..d217fc1fe6 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -366,14 +366,13 @@ void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
 }
 
 static void
-build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms)
+build_madt(GArray *table_data, BIOSLinker *linker,
+           X86MachineState *x86ms, AcpiDeviceIf *adev)
 {
-    MachineClass *mc = MACHINE_GET_CLASS(pcms);
-    X86MachineState *x86ms = X86_MACHINE(pcms);
-    const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(pcms));
+    MachineClass *mc = MACHINE_GET_CLASS(x86ms);
+    const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
     int madt_start = table_data->len;
-    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(pcms->acpi_dev);
-    AcpiDeviceIf *adev = ACPI_DEVICE_IF(pcms->acpi_dev);
+    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev);
     bool x2apic_mode = false;
 
     AcpiMultipleApicTable *madt;
@@ -2708,7 +2707,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     aml_len += tables_blob->len - fadt;
 
     acpi_add_table(table_offsets, tables_blob);
-    build_madt(tables_blob, tables->linker, pcms);
+    build_madt(tables_blob, tables->linker, x86ms,
+               ACPI_DEVICE_IF(pcms->acpi_dev));
 
     vmgenid_dev = find_vmgenid_dev();
     if (vmgenid_dev) {

From eb66ffabc00a7b062b2c73b9178eb2679328813b Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 20 May 2020 15:19:47 +0200
Subject: [PATCH 52/58] acpi: create acpi-common.c and move madt code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We'll need madt support for microvm.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <20200520132003.9492-7-kraxel@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/Makefile.objs |   1 +
 hw/i386/acpi-build.c  | 126 +---------------------------------
 hw/i386/acpi-common.c | 152 ++++++++++++++++++++++++++++++++++++++++++
 hw/i386/acpi-common.h |  14 ++++
 4 files changed, 170 insertions(+), 123 deletions(-)
 create mode 100644 hw/i386/acpi-common.c
 create mode 100644 hw/i386/acpi-common.h

diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 8ce1b26533..6abc74551a 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -16,4 +16,5 @@ obj-$(CONFIG_VMMOUSE) += vmmouse.o
 obj-$(CONFIG_PC) += port92.o
 
 obj-y += kvmvapic.o
+obj-$(CONFIG_ACPI) += acpi-common.o
 obj-$(CONFIG_PC) += acpi-build.o
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index d217fc1fe6..26c0c8aefa 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -24,6 +24,7 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qnum.h"
 #include "acpi-build.h"
+#include "acpi-common.h"
 #include "qemu/bitmap.h"
 #include "qemu/error-report.h"
 #include "hw/pci/pci.h"
@@ -89,9 +90,6 @@
 #define ACPI_BUILD_DPRINTF(fmt, ...)
 #endif
 
-/* Default IOAPIC ID */
-#define ACPI_BUILD_IOAPIC_ID 0x0
-
 typedef struct AcpiPmInfo {
     bool s3_disabled;
     bool s4_disabled;
@@ -327,124 +325,6 @@ build_facs(GArray *table_data)
     facs->length = cpu_to_le32(sizeof(*facs));
 }
 
-void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
-                       const CPUArchIdList *apic_ids, GArray *entry)
-{
-    uint32_t apic_id = apic_ids->cpus[uid].arch_id;
-
-    /* ACPI spec says that LAPIC entry for non present
-     * CPU may be omitted from MADT or it must be marked
-     * as disabled. However omitting non present CPU from
-     * MADT breaks hotplug on linux. So possible CPUs
-     * should be put in MADT but kept disabled.
-     */
-    if (apic_id < 255) {
-        AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic);
-
-        apic->type = ACPI_APIC_PROCESSOR;
-        apic->length = sizeof(*apic);
-        apic->processor_id = uid;
-        apic->local_apic_id = apic_id;
-        if (apic_ids->cpus[uid].cpu != NULL) {
-            apic->flags = cpu_to_le32(1);
-        } else {
-            apic->flags = cpu_to_le32(0);
-        }
-    } else {
-        AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic);
-
-        apic->type = ACPI_APIC_LOCAL_X2APIC;
-        apic->length = sizeof(*apic);
-        apic->uid = cpu_to_le32(uid);
-        apic->x2apic_id = cpu_to_le32(apic_id);
-        if (apic_ids->cpus[uid].cpu != NULL) {
-            apic->flags = cpu_to_le32(1);
-        } else {
-            apic->flags = cpu_to_le32(0);
-        }
-    }
-}
-
-static void
-build_madt(GArray *table_data, BIOSLinker *linker,
-           X86MachineState *x86ms, AcpiDeviceIf *adev)
-{
-    MachineClass *mc = MACHINE_GET_CLASS(x86ms);
-    const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
-    int madt_start = table_data->len;
-    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev);
-    bool x2apic_mode = false;
-
-    AcpiMultipleApicTable *madt;
-    AcpiMadtIoApic *io_apic;
-    AcpiMadtIntsrcovr *intsrcovr;
-    int i;
-
-    madt = acpi_data_push(table_data, sizeof *madt);
-    madt->local_apic_address = cpu_to_le32(APIC_DEFAULT_ADDRESS);
-    madt->flags = cpu_to_le32(1);
-
-    for (i = 0; i < apic_ids->len; i++) {
-        adevc->madt_cpu(adev, i, apic_ids, table_data);
-        if (apic_ids->cpus[i].arch_id > 254) {
-            x2apic_mode = true;
-        }
-    }
-
-    io_apic = acpi_data_push(table_data, sizeof *io_apic);
-    io_apic->type = ACPI_APIC_IO;
-    io_apic->length = sizeof(*io_apic);
-    io_apic->io_apic_id = ACPI_BUILD_IOAPIC_ID;
-    io_apic->address = cpu_to_le32(IO_APIC_DEFAULT_ADDRESS);
-    io_apic->interrupt = cpu_to_le32(0);
-
-    if (x86ms->apic_xrupt_override) {
-        intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
-        intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
-        intsrcovr->length = sizeof(*intsrcovr);
-        intsrcovr->source = 0;
-        intsrcovr->gsi    = cpu_to_le32(2);
-        intsrcovr->flags  = cpu_to_le16(0); /* conforms to bus specifications */
-    }
-    for (i = 1; i < 16; i++) {
-#define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11))
-        if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) {
-            /* No need for a INT source override structure. */
-            continue;
-        }
-        intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
-        intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
-        intsrcovr->length = sizeof(*intsrcovr);
-        intsrcovr->source = i;
-        intsrcovr->gsi    = cpu_to_le32(i);
-        intsrcovr->flags  = cpu_to_le16(0xd); /* active high, level triggered */
-    }
-
-    if (x2apic_mode) {
-        AcpiMadtLocalX2ApicNmi *local_nmi;
-
-        local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
-        local_nmi->type   = ACPI_APIC_LOCAL_X2APIC_NMI;
-        local_nmi->length = sizeof(*local_nmi);
-        local_nmi->uid    = 0xFFFFFFFF; /* all processors */
-        local_nmi->flags  = cpu_to_le16(0);
-        local_nmi->lint   = 1; /* ACPI_LINT1 */
-    } else {
-        AcpiMadtLocalNmi *local_nmi;
-
-        local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
-        local_nmi->type         = ACPI_APIC_LOCAL_NMI;
-        local_nmi->length       = sizeof(*local_nmi);
-        local_nmi->processor_id = 0xff; /* all processors */
-        local_nmi->flags        = cpu_to_le16(0);
-        local_nmi->lint         = 1; /* ACPI_LINT1 */
-    }
-
-    build_header(linker, table_data,
-                 (void *)(table_data->data + madt_start), "APIC",
-                 table_data->len - madt_start, 1, NULL, NULL);
-}
-
 static void build_append_pcihp_notify_entry(Aml *method, int slot)
 {
     Aml *if_ctx;
@@ -2707,8 +2587,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     aml_len += tables_blob->len - fadt;
 
     acpi_add_table(table_offsets, tables_blob);
-    build_madt(tables_blob, tables->linker, x86ms,
-               ACPI_DEVICE_IF(pcms->acpi_dev));
+    acpi_build_madt(tables_blob, tables->linker, x86ms,
+                    ACPI_DEVICE_IF(pcms->acpi_dev));
 
     vmgenid_dev = find_vmgenid_dev();
     if (vmgenid_dev) {
diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
new file mode 100644
index 0000000000..5caca16a0b
--- /dev/null
+++ b/hw/i386/acpi-common.c
@@ -0,0 +1,152 @@
+/* Support for generating ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2008-2010  Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+
+#include "exec/memory.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/utils.h"
+#include "hw/i386/pc.h"
+#include "target/i386/cpu.h"
+
+#include "acpi-build.h"
+#include "acpi-common.h"
+
+void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
+                       const CPUArchIdList *apic_ids, GArray *entry)
+{
+    uint32_t apic_id = apic_ids->cpus[uid].arch_id;
+
+    /* ACPI spec says that LAPIC entry for non present
+     * CPU may be omitted from MADT or it must be marked
+     * as disabled. However omitting non present CPU from
+     * MADT breaks hotplug on linux. So possible CPUs
+     * should be put in MADT but kept disabled.
+     */
+    if (apic_id < 255) {
+        AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic);
+
+        apic->type = ACPI_APIC_PROCESSOR;
+        apic->length = sizeof(*apic);
+        apic->processor_id = uid;
+        apic->local_apic_id = apic_id;
+        if (apic_ids->cpus[uid].cpu != NULL) {
+            apic->flags = cpu_to_le32(1);
+        } else {
+            apic->flags = cpu_to_le32(0);
+        }
+    } else {
+        AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic);
+
+        apic->type = ACPI_APIC_LOCAL_X2APIC;
+        apic->length = sizeof(*apic);
+        apic->uid = cpu_to_le32(uid);
+        apic->x2apic_id = cpu_to_le32(apic_id);
+        if (apic_ids->cpus[uid].cpu != NULL) {
+            apic->flags = cpu_to_le32(1);
+        } else {
+            apic->flags = cpu_to_le32(0);
+        }
+    }
+}
+
+void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
+                     X86MachineState *x86ms, AcpiDeviceIf *adev)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(x86ms);
+    const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
+    int madt_start = table_data->len;
+    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev);
+    bool x2apic_mode = false;
+
+    AcpiMultipleApicTable *madt;
+    AcpiMadtIoApic *io_apic;
+    AcpiMadtIntsrcovr *intsrcovr;
+    int i;
+
+    madt = acpi_data_push(table_data, sizeof *madt);
+    madt->local_apic_address = cpu_to_le32(APIC_DEFAULT_ADDRESS);
+    madt->flags = cpu_to_le32(1);
+
+    for (i = 0; i < apic_ids->len; i++) {
+        adevc->madt_cpu(adev, i, apic_ids, table_data);
+        if (apic_ids->cpus[i].arch_id > 254) {
+            x2apic_mode = true;
+        }
+    }
+
+    io_apic = acpi_data_push(table_data, sizeof *io_apic);
+    io_apic->type = ACPI_APIC_IO;
+    io_apic->length = sizeof(*io_apic);
+    io_apic->io_apic_id = ACPI_BUILD_IOAPIC_ID;
+    io_apic->address = cpu_to_le32(IO_APIC_DEFAULT_ADDRESS);
+    io_apic->interrupt = cpu_to_le32(0);
+
+    if (x86ms->apic_xrupt_override) {
+        intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
+        intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
+        intsrcovr->length = sizeof(*intsrcovr);
+        intsrcovr->source = 0;
+        intsrcovr->gsi    = cpu_to_le32(2);
+        intsrcovr->flags  = cpu_to_le16(0); /* conforms to bus specifications */
+    }
+    for (i = 1; i < 16; i++) {
+#define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11))
+        if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) {
+            /* No need for a INT source override structure. */
+            continue;
+        }
+        intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
+        intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
+        intsrcovr->length = sizeof(*intsrcovr);
+        intsrcovr->source = i;
+        intsrcovr->gsi    = cpu_to_le32(i);
+        intsrcovr->flags  = cpu_to_le16(0xd); /* active high, level triggered */
+    }
+
+    if (x2apic_mode) {
+        AcpiMadtLocalX2ApicNmi *local_nmi;
+
+        local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
+        local_nmi->type   = ACPI_APIC_LOCAL_X2APIC_NMI;
+        local_nmi->length = sizeof(*local_nmi);
+        local_nmi->uid    = 0xFFFFFFFF; /* all processors */
+        local_nmi->flags  = cpu_to_le16(0);
+        local_nmi->lint   = 1; /* ACPI_LINT1 */
+    } else {
+        AcpiMadtLocalNmi *local_nmi;
+
+        local_nmi = acpi_data_push(table_data, sizeof *local_nmi);
+        local_nmi->type         = ACPI_APIC_LOCAL_NMI;
+        local_nmi->length       = sizeof(*local_nmi);
+        local_nmi->processor_id = 0xff; /* all processors */
+        local_nmi->flags        = cpu_to_le16(0);
+        local_nmi->lint         = 1; /* ACPI_LINT1 */
+    }
+
+    build_header(linker, table_data,
+                 (void *)(table_data->data + madt_start), "APIC",
+                 table_data->len - madt_start, 1, NULL, NULL);
+}
+
diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h
new file mode 100644
index 0000000000..c30e461f18
--- /dev/null
+++ b/hw/i386/acpi-common.h
@@ -0,0 +1,14 @@
+#ifndef HW_I386_ACPI_COMMON_H
+#define HW_I386_ACPI_COMMON_H
+#include "include/hw/acpi/acpi_dev_interface.h"
+
+#include "include/hw/acpi/bios-linker-loader.h"
+#include "include/hw/i386/x86.h"
+
+/* Default IOAPIC ID */
+#define ACPI_BUILD_IOAPIC_ID 0x0
+
+void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
+                     X86MachineState *x86ms, AcpiDeviceIf *adev);
+
+#endif

From 5794d34a13d11e12d9bc53ea4ade997c2ff81c4c Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 20 May 2020 15:19:48 +0200
Subject: [PATCH 53/58] acpi: madt: skip pci override on pci-less systems.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Needed for microvm.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200520132003.9492-8-kraxel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c  |  2 +-
 hw/i386/acpi-common.c | 26 +++++++++++++++-----------
 hw/i386/acpi-common.h |  3 ++-
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 26c0c8aefa..473cbdfffd 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2588,7 +2588,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
 
     acpi_add_table(table_offsets, tables_blob);
     acpi_build_madt(tables_blob, tables->linker, x86ms,
-                    ACPI_DEVICE_IF(pcms->acpi_dev));
+                    ACPI_DEVICE_IF(pcms->acpi_dev), true);
 
     vmgenid_dev = find_vmgenid_dev();
     if (vmgenid_dev) {
diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
index 5caca16a0b..ab9b00581a 100644
--- a/hw/i386/acpi-common.c
+++ b/hw/i386/acpi-common.c
@@ -72,7 +72,8 @@ void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
 }
 
 void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
-                     X86MachineState *x86ms, AcpiDeviceIf *adev)
+                     X86MachineState *x86ms, AcpiDeviceIf *adev,
+                     bool has_pci)
 {
     MachineClass *mc = MACHINE_GET_CLASS(x86ms);
     const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
@@ -111,18 +112,21 @@ void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
         intsrcovr->gsi    = cpu_to_le32(2);
         intsrcovr->flags  = cpu_to_le16(0); /* conforms to bus specifications */
     }
-    for (i = 1; i < 16; i++) {
+
+    if (has_pci) {
+        for (i = 1; i < 16; i++) {
 #define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11))
-        if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) {
-            /* No need for a INT source override structure. */
-            continue;
+            if (!(ACPI_BUILD_PCI_IRQS & (1 << i))) {
+                /* No need for a INT source override structure. */
+                continue;
+            }
+            intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
+            intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
+            intsrcovr->length = sizeof(*intsrcovr);
+            intsrcovr->source = i;
+            intsrcovr->gsi    = cpu_to_le32(i);
+            intsrcovr->flags  = cpu_to_le16(0xd); /* active high, level triggered */
         }
-        intsrcovr = acpi_data_push(table_data, sizeof *intsrcovr);
-        intsrcovr->type   = ACPI_APIC_XRUPT_OVERRIDE;
-        intsrcovr->length = sizeof(*intsrcovr);
-        intsrcovr->source = i;
-        intsrcovr->gsi    = cpu_to_le32(i);
-        intsrcovr->flags  = cpu_to_le16(0xd); /* active high, level triggered */
     }
 
     if (x2apic_mode) {
diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h
index c30e461f18..9cac18dddf 100644
--- a/hw/i386/acpi-common.h
+++ b/hw/i386/acpi-common.h
@@ -9,6 +9,7 @@
 #define ACPI_BUILD_IOAPIC_ID 0x0
 
 void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
-                     X86MachineState *x86ms, AcpiDeviceIf *adev);
+                     X86MachineState *x86ms, AcpiDeviceIf *adev,
+                     bool has_pci);
 
 #endif

From c8ed8f57cc558045638512249ffc5f3633f76db1 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 20 May 2020 15:19:49 +0200
Subject: [PATCH 54/58] acpi: fadt: add hw-reduced sleep register support

Add fields to struct AcpiFadtData and update build_fadt() to properly
generate sleep register entries.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <20200520132003.9492-9-kraxel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/aml-build.c         | 4 ++--
 include/hw/acpi/acpi-defs.h | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index d24e9e6c3a..2cb7b991ef 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1866,9 +1866,9 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f,
     }
 
     /* SLEEP_CONTROL_REG */
-    build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
+    build_append_gas_from_struct(tbl, &f->sleep_ctl);
     /* SLEEP_STATUS_REG */
-    build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
+    build_append_gas_from_struct(tbl, &f->sleep_sts);
 
     /* TODO: extra fields need to be added to support revisions above rev5 */
     assert(f->rev == 5);
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index c13327fa78..3be9ab5049 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -88,6 +88,8 @@ typedef struct AcpiFadtData {
     struct AcpiGenericAddress pm_tmr;    /* PM_TMR_BLK */
     struct AcpiGenericAddress gpe0_blk;  /* GPE0_BLK */
     struct AcpiGenericAddress reset_reg; /* RESET_REG */
+    struct AcpiGenericAddress sleep_ctl; /* SLEEP_CONTROL_REG */
+    struct AcpiGenericAddress sleep_sts; /* SLEEP_STATUS_REG */
     uint8_t reset_val;         /* RESET_VALUE */
     uint8_t  rev;              /* Revision */
     uint32_t flags;            /* Flags */

From 32905fc95c0a082f4cb28b8076e0160b3c62c7f8 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 20 May 2020 15:19:50 +0200
Subject: [PATCH 55/58] acpi: ged: rename event memory region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename memory region and callbacks and ops to carry "evt" in the name
because a second region will be added shortly.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Message-Id: <20200520132003.9492-10-kraxel@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Igor Mammedow <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/acpi/generic_event_device.c         | 16 ++++++++--------
 include/hw/acpi/generic_event_device.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index b1cbdd86b6..1cb34111e5 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -142,7 +142,7 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev,
 }
 
 /* Memory read by the GED _EVT AML dynamic method */
-static uint64_t ged_read(void *opaque, hwaddr addr, unsigned size)
+static uint64_t ged_evt_read(void *opaque, hwaddr addr, unsigned size)
 {
     uint64_t val = 0;
     GEDState *ged_st = opaque;
@@ -161,14 +161,14 @@ static uint64_t ged_read(void *opaque, hwaddr addr, unsigned size)
 }
 
 /* Nothing is expected to be written to the GED memory region */
-static void ged_write(void *opaque, hwaddr addr, uint64_t data,
-                      unsigned int size)
+static void ged_evt_write(void *opaque, hwaddr addr, uint64_t data,
+                          unsigned int size)
 {
 }
 
-static const MemoryRegionOps ged_ops = {
-    .read = ged_read,
-    .write = ged_write,
+static const MemoryRegionOps ged_evt_ops = {
+    .read = ged_evt_read,
+    .write = ged_evt_write,
     .endianness = DEVICE_LITTLE_ENDIAN,
     .valid = {
         .min_access_size = 4,
@@ -287,9 +287,9 @@ static void acpi_ged_initfn(Object *obj)
     SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
     GEDState *ged_st = &s->ged_state;
 
-    memory_region_init_io(&ged_st->io, obj, &ged_ops, ged_st,
+    memory_region_init_io(&ged_st->evt, obj, &ged_evt_ops, ged_st,
                           TYPE_ACPI_GED, ACPI_GED_EVT_SEL_LEN);
-    sysbus_init_mmio(sbd, &ged_st->io);
+    sysbus_init_mmio(sbd, &ged_st->evt);
 
     sysbus_init_irq(sbd, &s->irq);
 
diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h
index 83917de024..90a9180db5 100644
--- a/include/hw/acpi/generic_event_device.h
+++ b/include/hw/acpi/generic_event_device.h
@@ -86,7 +86,7 @@
 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4
 
 typedef struct GEDState {
-    MemoryRegion io;
+    MemoryRegion evt;
     uint32_t     sel;
 } GEDState;
 

From 62925fd2b8b778539607b321216cd54897ddb63a Mon Sep 17 00:00:00 2001
From: Raphael Norwitz <raphael.s.norwitz@gmail.com>
Date: Thu, 7 May 2020 17:37:58 -0400
Subject: [PATCH 56/58] Fix parameter type in vhost migration log path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ‘enable’ parameter to the vhost_migration_log() function is given as
an int, but "true"/"false" values are passed in wherever it is invoked.
Inside the function itself it is only ever compared with bool values.
Therefore the parameter value itself should be changed to bool.

Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
Message-Id: <CAFubqFtqNZw=Y-ar3N=3zTQi6LkKg_G-7W7OOHHbE7Y1fV7HAQ@mail.gmail.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index aff98a0ede..aa06a36919 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -809,12 +809,12 @@ err_features:
     return r;
 }
 
-static int vhost_migration_log(MemoryListener *listener, int enable)
+static int vhost_migration_log(MemoryListener *listener, bool enable)
 {
     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                          memory_listener);
     int r;
-    if (!!enable == dev->log_enabled) {
+    if (enable == dev->log_enabled) {
         return 0;
     }
     if (!dev->started) {

From 12fcf49c1a016cc05638b965f4e7b909794bffa2 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 17 Mar 2020 15:59:08 -0400
Subject: [PATCH 57/58] pci: Display PCI IRQ pin in "info pci"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sometimes it would be good to be able to read the pin number along
with the IRQ number allocated.  Since we'll dump the IRQ number, no
reason to not dump the pin information.  For example, the vfio-pci
device will overwrite the pin with the hardware pin number.  It would
be nice to know the pin number of one assigned device from QMP/HMP.

CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Alex Williamson <alex.williamson@redhat.com>
CC: Michael S. Tsirkin <mst@redhat.com>
CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
CC: Julia Suvorova <jusual@redhat.com>
CC: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20200317195908.283800-1-peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
---
 hw/pci/pci.c       | 1 +
 monitor/hmp-cmds.c | 3 ++-
 qapi/misc.json     | 6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 1b88a32cf7..a60cf3ae3b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1776,6 +1776,7 @@ static PciDeviceInfo *qmp_query_pci_device(PCIDevice *dev, PCIBus *bus,
     info->regions = qmp_query_pci_regions(dev);
     info->qdev_id = g_strdup(dev->qdev.id ? dev->qdev.id : "");
 
+    info->irq_pin = dev->config[PCI_INTERRUPT_PIN];
     if (dev->config[PCI_INTERRUPT_PIN] != 0) {
         info->has_irq = true;
         info->irq = dev->config[PCI_INTERRUPT_LINE];
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 9c61e769ca..e03adf0d4d 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -688,7 +688,8 @@ static void hmp_info_pci_device(Monitor *mon, const PciDeviceInfo *dev)
     }
 
     if (dev->has_irq) {
-        monitor_printf(mon, "      IRQ %" PRId64 ".\n", dev->irq);
+        monitor_printf(mon, "      IRQ %" PRId64 ", pin %c\n",
+                       dev->irq, (char)('A' + dev->irq_pin - 1));
     }
 
     if (dev->has_pci_bridge) {
diff --git a/qapi/misc.json b/qapi/misc.json
index 99b90ac80b..a5a0beb902 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -403,6 +403,8 @@
 #
 # @irq: if an IRQ is assigned to the device, the IRQ number
 #
+# @irq_pin: the IRQ pin, zero means no IRQ (since 5.1)
+#
 # @qdev_id: the device name of the PCI device
 #
 # @pci_bridge: if the device is a PCI bridge, the bridge information
@@ -417,8 +419,8 @@
 { 'struct': 'PciDeviceInfo',
   'data': {'bus': 'int', 'slot': 'int', 'function': 'int',
            'class_info': 'PciDeviceClass', 'id': 'PciDeviceId',
-           '*irq': 'int', 'qdev_id': 'str', '*pci_bridge': 'PciBridgeInfo',
-           'regions': ['PciMemoryRegion']} }
+           '*irq': 'int', 'irq_pin': 'int', 'qdev_id': 'str',
+           '*pci_bridge': 'PciBridgeInfo', 'regions': ['PciMemoryRegion'] }}
 
 ##
 # @PciInfo:

From 10d35e581901c09ee3817ac7cddd296d05291a9d Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 10 Jun 2020 13:43:51 +0800
Subject: [PATCH 58/58] virtio-pci: fix queue_enable write

Spec said: The driver uses this to selectively prevent the device from
executing requests from this virtqueue. 1 - enabled; 0 - disabled.

Though write 0 to queue_enable is forbidden by the spec, we should not
assume that the value is 1.

Fix this by ignore the write value other than 1.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20200610054351.15811-1-jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio-pci.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index d028c17c24..7bc8c1c056 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1273,16 +1273,20 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
         virtio_queue_set_vector(vdev, vdev->queue_sel, val);
         break;
     case VIRTIO_PCI_COMMON_Q_ENABLE:
-        virtio_queue_set_num(vdev, vdev->queue_sel,
-                             proxy->vqs[vdev->queue_sel].num);
-        virtio_queue_set_rings(vdev, vdev->queue_sel,
+        if (val == 1) {
+            virtio_queue_set_num(vdev, vdev->queue_sel,
+                                 proxy->vqs[vdev->queue_sel].num);
+            virtio_queue_set_rings(vdev, vdev->queue_sel,
                        ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
                        proxy->vqs[vdev->queue_sel].desc[0],
                        ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
                        proxy->vqs[vdev->queue_sel].avail[0],
                        ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
                        proxy->vqs[vdev->queue_sel].used[0]);
-        proxy->vqs[vdev->queue_sel].enabled = 1;
+            proxy->vqs[vdev->queue_sel].enabled = 1;
+        } else {
+            virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
+        }
         break;
     case VIRTIO_PCI_COMMON_Q_DESCLO:
         proxy->vqs[vdev->queue_sel].desc[0] = val;